xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 8fd04b8338ed5093ec2d1e668fa620b7de44c177)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip6.h>		/* for ip6_t */
54 #include <inet/tcp.h>		/* for tcph_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/multidata.h>
63 
64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
65 
66 /*
67  * Per-interface tunables (for developers)
68  *
69  * ibd_tx_copy_thresh
70  *     This sets the threshold up to which ibd will attempt to do a bcopy of
71  *     the outgoing data into a pre-mapped buffer. The IPoIB driver's send
72  *     behavior is restricted by various parameters, so this value should be
73  *     changed only after careful consideration.  For instance, IB HCAs
74  *     currently impose a relatively small limit (compared to ethernet NICs)
75  *     on the length of the SGL for transmit. On the other hand, the ip stack
76  *     could send down mp chains that are quite long when LSO is enabled.
77  *
78  * ibd_num_swqe
79  *     Number of "send WQE" elements that will be allocated and used by ibd.
80  *     When tuning this parameter, the size of the pre-allocated, pre-mapped
81  *     copy buffer in each of these send wqes must be taken into account. This
82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
83  *     currently set to the same value as ibd_tx_copy_thresh, but may be
84  *     changed independently if needed).
85  *
86  * ibd_num_rwqe
87  *     Number of "receive WQE" elements that will be allocated and used by
88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
89  *     Each buffer in the receive wqe will be of MTU size.
90  *
91  * ibd_num_lso_bufs
92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
94  *     and too large to be used with regular MTU-sized copy buffers. It is
95  *     not recommended to tune this variable without understanding the
96  *     application environment and/or memory resources. The size of each of
97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
98  *
99  * ibd_num_ah
100  *     Number of AH cache entries to allocate
101  *
102  * ibd_hash_size
103  *     Hash table size for the active AH list
104  *
105  * ibd_tx_softintr
106  * ibd_rx_softintr
107  *     The softintr mechanism allows ibd to avoid event queue overflows if
108  *     the receive/completion handlers are expensive. These are enabled
109  *     by default.
110  *
111  * ibd_log_sz
112  *     This specifies the size of the ibd log buffer in bytes. The buffer is
113  *     allocated and logging is enabled only when IBD_LOGGING is defined.
114  *
115  */
116 uint_t ibd_tx_copy_thresh = 0x1000;
117 uint_t ibd_num_swqe = 4000;
118 uint_t ibd_num_rwqe = 4000;
119 uint_t ibd_num_lso_bufs = 0x400;
120 uint_t ibd_num_ah = 256;
121 uint_t ibd_hash_size = 32;
122 uint_t ibd_rx_softintr = 1;
123 uint_t ibd_tx_softintr = 1;
124 uint_t ibd_create_broadcast_group = 1;
125 #ifdef IBD_LOGGING
126 uint_t ibd_log_sz = 0x20000;
127 #endif
128 
129 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
130 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
131 #define	IBD_NUM_SWQE			ibd_num_swqe
132 #define	IBD_NUM_RWQE			ibd_num_rwqe
133 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
134 #define	IBD_NUM_AH			ibd_num_ah
135 #define	IBD_HASH_SIZE			ibd_hash_size
136 #ifdef IBD_LOGGING
137 #define	IBD_LOG_SZ			ibd_log_sz
138 #endif
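/*
 * Illustrative sketch (not taken from this file): because the tunables above
 * are plain global variables, a developer can override their defaults from
 * /etc/system; the values below are hypothetical examples, not
 * recommendations:
 *
 *	set ibd:ibd_num_swqe = 8000
 *	set ibd:ibd_num_rwqe = 8000
 *	set ibd:ibd_tx_copy_thresh = 0x2000
 *
 * Note that changing ibd_tx_copy_thresh also changes IBD_TX_BUF_SZ, since
 * both macros above currently expand to the same variable.
 */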
139 
140 /*
141  * ibd_rc_tx_copy_thresh
142  *     This sets the threshold up to which ibd will attempt to do a bcopy of the
143  *     outgoing data into a pre-mapped buffer.
144  */
145 uint_t ibd_rc_tx_copy_thresh = 0x1000;
146 
147 /*
148  * Receive CQ moderation parameters: tunable (for developers)
149  */
150 uint_t ibd_rxcomp_count = 4;
151 uint_t ibd_rxcomp_usec = 10;
152 
153 /*
154  * Send CQ moderation parameters: tunable (for developers)
155  */
156 uint_t ibd_txcomp_count = 16;
157 uint_t ibd_txcomp_usec = 300;
158 
159 /* Post IBD_RX_POST_CNT receive work requests at a time. */
160 #define	IBD_RX_POST_CNT			8
161 
162 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
163 #define	IBD_LOG_RX_POST			4
164 
165 /* Minimum number of receive work requests the driver needs to always have */
166 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
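/*
 * For example, with the defaults above this works out to
 * IBD_RWQE_MIN = (8 << 4) * 4 = 512 receive work requests.
 */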
167 
168 /*
169  * LSO parameters
170  */
171 #define	IBD_LSO_MAXLEN			65536
172 #define	IBD_LSO_BUFSZ			8192
173 #define	IBD_PROP_LSO_POLICY		"lso-policy"
174 
175 /*
176  * Async operation states
177  */
178 #define	IBD_OP_NOTSTARTED		0
179 #define	IBD_OP_ONGOING			1
180 #define	IBD_OP_COMPLETED		2
181 #define	IBD_OP_ERRORED			3
182 #define	IBD_OP_ROUTERED			4
183 
184 /*
185  * State of IBD driver initialization during attach/m_start
186  */
187 #define	IBD_DRV_STATE_INITIALIZED	0x00001
188 #define	IBD_DRV_RXINTR_ADDED		0x00002
189 #define	IBD_DRV_TXINTR_ADDED		0x00004
190 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
191 #define	IBD_DRV_HCA_OPENED		0x00010
192 #define	IBD_DRV_PD_ALLOCD		0x00020
193 #define	IBD_DRV_MAC_REGISTERED		0x00040
194 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
195 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
196 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
197 #define	IBD_DRV_CQS_ALLOCD		0x00400
198 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
199 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
200 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
201 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
202 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
203 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
204 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
205 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
206 #define	IBD_DRV_STARTED			0x80000
207 #define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
208 #define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
209 #define	IBD_DRV_RC_LISTEN		0x400000
210 #ifdef DEBUG
211 #define	IBD_DRV_RC_PRIVATE_STATE	0x800000
212 #endif
213 
214 /*
215  * Start/stop in-progress flags; note that restart must always remain
216  * the OR of start and stop flag values.
217  */
218 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
219 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
220 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
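/* i.e. 0x30000000 == (0x10000000 | 0x20000000) */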
221 
222 /*
223  * Miscellaneous constants
224  */
225 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
226 #define	IBD_DEF_MAX_SDU			2044
227 #define	IBD_DEFAULT_QKEY		0xB1B
228 #ifdef IBD_LOGGING
229 #define	IBD_DMAX_LINE			100
230 #endif
231 
232 /*
233  * Enumerations for link states
234  */
235 typedef enum {
236 	IBD_LINK_DOWN,
237 	IBD_LINK_UP,
238 	IBD_LINK_UP_ABSENT
239 } ibd_link_op_t;
240 
241 /*
242  * Driver State Pointer
243  */
244 void *ibd_list;
245 
246 /*
247  * Driver Global Data
248  */
249 ibd_global_state_t ibd_gstate;
250 
251 /*
252  * Logging
253  */
254 #ifdef IBD_LOGGING
255 kmutex_t ibd_lbuf_lock;
256 uint8_t *ibd_lbuf;
257 uint32_t ibd_lbuf_ndx;
258 #endif
259 
260 /*
261  * Required system entry points
262  */
263 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
264 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
265 
266 /*
267  * Required driver entry points for GLDv3
268  */
269 static int ibd_m_stat(void *, uint_t, uint64_t *);
270 static int ibd_m_start(void *);
271 static void ibd_m_stop(void *);
272 static int ibd_m_promisc(void *, boolean_t);
273 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
274 static int ibd_m_unicst(void *, const uint8_t *);
275 static mblk_t *ibd_m_tx(void *, mblk_t *);
276 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
277 
278 /*
279  * Private driver entry points for GLDv3
280  */
281 
282 /*
283  * Initialization
284  */
285 static int ibd_state_init(ibd_state_t *, dev_info_t *);
286 static int ibd_init_txlist(ibd_state_t *);
287 static int ibd_init_rxlist(ibd_state_t *);
288 static int ibd_acache_init(ibd_state_t *);
289 #ifdef IBD_LOGGING
290 static void ibd_log_init(void);
291 #endif
292 
293 /*
294  * Termination/cleanup
295  */
296 static void ibd_state_fini(ibd_state_t *);
297 static void ibd_fini_txlist(ibd_state_t *);
298 static void ibd_fini_rxlist(ibd_state_t *);
299 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
300 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
301 static void ibd_acache_fini(ibd_state_t *);
302 #ifdef IBD_LOGGING
303 static void ibd_log_fini(void);
304 #endif
305 
306 /*
307  * Allocation/acquire/map routines
308  */
309 static int ibd_alloc_tx_copybufs(ibd_state_t *);
310 static int ibd_alloc_rx_copybufs(ibd_state_t *);
311 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
312 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
313 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
314     uint32_t *);
315 
316 /*
317  * Free/release/unmap routines
318  */
319 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
320 static void ibd_free_tx_copybufs(ibd_state_t *);
321 static void ibd_free_rx_copybufs(ibd_state_t *);
322 static void ibd_free_rx_rsrcs(ibd_state_t *);
323 static void ibd_free_tx_lsobufs(ibd_state_t *);
324 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
325 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
326 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
327 
328 /*
329  * Handlers/callback routines
330  */
331 static uint_t ibd_intr(caddr_t);
332 static uint_t ibd_tx_recycle(caddr_t);
333 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
334 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
335 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
336 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
337 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
338 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
339 static void ibd_freemsg_cb(char *);
340 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
341     ibt_async_event_t *);
342 static void ibd_snet_notices_handler(void *, ib_gid_t,
343     ibt_subnet_event_code_t, ibt_subnet_event_t *);
344 
345 /*
346  * Send/receive routines
347  */
348 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
349 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
350 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
351 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
352 
353 /*
354  * Threads
355  */
356 static void ibd_async_work(ibd_state_t *);
357 
358 /*
359  * Async tasks
360  */
361 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
362 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
363 static void ibd_async_setprom(ibd_state_t *);
364 static void ibd_async_unsetprom(ibd_state_t *);
365 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
366 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
367 static void ibd_async_txsched(ibd_state_t *);
368 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
369 
370 /*
371  * Async task helpers
372  */
373 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
374 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
375 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
376 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
377     ipoib_mac_t *, ipoib_mac_t *);
378 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
379 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
380 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
381 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
382 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
383 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
384 static uint64_t ibd_get_portspeed(ibd_state_t *);
385 static boolean_t ibd_async_safe(ibd_state_t *);
386 static void ibd_async_done(ibd_state_t *);
387 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
388 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
389 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
390 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
391 
392 /*
393  * Helpers for attach/start routines
394  */
395 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
396 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
397 static int ibd_unattach(ibd_state_t *, dev_info_t *);
398 static int ibd_get_port_details(ibd_state_t *);
399 static int ibd_alloc_cqs(ibd_state_t *);
400 static int ibd_setup_ud_channel(ibd_state_t *);
401 static int ibd_start(ibd_state_t *);
402 static int ibd_undo_start(ibd_state_t *, link_state_t);
403 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
404 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
405 
406 
407 /*
408  * Miscellaneous helpers
409  */
410 static int ibd_sched_poll(ibd_state_t *, int, int);
411 static void ibd_resume_transmission(ibd_state_t *);
412 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
413 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
414 static void *list_get_head(list_t *);
415 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
416 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
417 #ifdef IBD_LOGGING
418 static void ibd_log(const char *, ...);
419 #endif
420 
421 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
422     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
423 
424 /* Module Driver Info */
425 static struct modldrv ibd_modldrv = {
426 	&mod_driverops,			/* This one is a driver */
427 	"InfiniBand GLDv3 Driver",	/* short description */
428 	&ibd_dev_ops			/* driver specific ops */
429 };
430 
431 /* Module Linkage */
432 static struct modlinkage ibd_modlinkage = {
433 	MODREV_1, (void *)&ibd_modldrv, NULL
434 };
435 
436 /*
437  * Module (static) info passed to IBTL during ibt_attach
438  */
439 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
440 	IBTI_V_CURR,
441 	IBT_NETWORK,
442 	ibd_async_handler,
443 	NULL,
444 	"IPIB"
445 };
446 
447 /*
448  * GLDv3 entry points
449  */
450 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
451 static mac_callbacks_t ibd_m_callbacks = {
452 	IBD_M_CALLBACK_FLAGS,
453 	ibd_m_stat,
454 	ibd_m_start,
455 	ibd_m_stop,
456 	ibd_m_promisc,
457 	ibd_m_multicst,
458 	ibd_m_unicst,
459 	ibd_m_tx,
460 	NULL,
461 	ibd_m_getcapab
462 };
463 
464 /*
465  * Fill/clear <scope> and <p_key> in multicast/broadcast address
466  */
467 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
468 {							\
469 	*(uint32_t *)((char *)(maddr) + 4) |=		\
470 	    htonl((uint32_t)(scope) << 16);		\
471 	*(uint32_t *)((char *)(maddr) + 8) |=		\
472 	    htonl((uint32_t)(pkey) << 16);		\
473 }
474 
475 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
476 {							\
477 	*(uint32_t *)((char *)(maddr) + 4) &=		\
478 	    htonl(~((uint32_t)0xF << 16));		\
479 	*(uint32_t *)((char *)(maddr) + 8) &=		\
480 	    htonl(~((uint32_t)0xFFFF << 16));		\
481 }
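/*
 * Illustrative example (not used by the code): the two words patched above
 * are the first eight bytes of the GID portion of the ipoib_mac_t. The fill
 * macro ORs the 4-bit scope into the low nibble of GID byte 1 and the 16-bit
 * pkey into GID bytes 4 and 5, both in network byte order. So, assuming a
 * multicast template whose GID begins ff10:401b:0000:..., filling in scope
 * 0x2 (link-local) and pkey 0xffff yields ff12:401b:ffff:..., the familiar
 * IPv4 broadcast MGID prefix; the clear macro undoes exactly those bits.
 */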
482 
483 /*
484  * Rudimentary debugging support
485  */
486 #ifdef DEBUG
487 int ibd_debuglevel = 100;
488 void
489 debug_print(int l, char *fmt, ...)
490 {
491 	va_list ap;
492 
493 	if (l < ibd_debuglevel)
494 		return;
495 	va_start(ap, fmt);
496 	vcmn_err(CE_CONT, fmt, ap);
497 	va_end(ap);
498 }
499 #endif
500 
501 /*
502  * Common routine to print warning messages; adds in the hca guid, port
503  * number and pkey to identify the IBA interface.
504  */
505 void
506 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
507 {
508 	ib_guid_t hca_guid;
509 	char ibd_print_buf[256];
510 	int len;
511 	va_list ap;
512 
513 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
514 	    0, "hca-guid", 0);
515 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
516 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
517 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
518 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
519 	va_start(ap, fmt);
520 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
521 	    fmt, ap);
522 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
523 	va_end(ap);
524 }
525 
526 /*
527  * Warlock directives
528  */
529 
530 /*
531  * id_lso_lock
532  *
533  * state->id_lso->bkt_nfree may be accessed without a lock to
534  * determine the threshold at which we have to ask the nw layer
535  * to resume transmission (see ibd_resume_transmission()).
536  */
537 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
538     ibd_state_t::id_lso))
539 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
540 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
541 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
542 
543 /*
544  * id_scq_poll_lock
545  */
546 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
547     ibd_state_t::id_scq_poll_busy))
548 
549 /*
550  * id_txpost_lock
551  */
552 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
553     ibd_state_t::id_tx_head))
554 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
555     ibd_state_t::id_tx_busy))
556 
557 /*
558  * id_acache_req_lock
559  */
560 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
561     ibd_state_t::id_acache_req_cv))
562 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
563     ibd_state_t::id_req_list))
564 _NOTE(SCHEME_PROTECTS_DATA("atomic",
565     ibd_acache_s::ac_ref))
566 
567 /*
568  * id_ac_mutex
569  *
570  * This mutex is actually supposed to protect id_ah_op as well,
571  * but this path of the code isn't clean (see update of id_ah_op
572  * in ibd_async_acache(), immediately after the call to
573  * ibd_async_mcache()). For now, we'll skip this check by
574  * declaring that id_ah_op is protected by some internal scheme
575  * that warlock isn't aware of.
576  */
577 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
578     ibd_state_t::id_ah_active))
579 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
580     ibd_state_t::id_ah_free))
581 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
582     ibd_state_t::id_ah_addr))
583 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
584     ibd_state_t::id_ah_op))
585 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
586     ibd_state_t::id_ah_error))
587 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
588     ibd_state_t::id_ac_hot_ace))
589 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
590 
591 /*
592  * id_mc_mutex
593  */
594 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
595     ibd_state_t::id_mc_full))
596 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
597     ibd_state_t::id_mc_non))
598 
599 /*
600  * id_trap_lock
601  */
602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
603     ibd_state_t::id_trap_cv))
604 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
605     ibd_state_t::id_trap_stop))
606 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
607     ibd_state_t::id_trap_inprog))
608 
609 /*
610  * id_prom_op
611  */
612 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
613     ibd_state_t::id_prom_op))
614 
615 /*
616  * id_sched_lock
617  */
618 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
619     ibd_state_t::id_sched_needed))
620 
621 /*
622  * id_link_mutex
623  */
624 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
625     ibd_state_t::id_link_state))
626 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
627 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
628     ibd_state_t::id_link_speed))
629 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
630 
631 /*
632  * id_tx_list.dl_mutex
633  */
634 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
635     ibd_state_t::id_tx_list.dl_head))
636 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
637     ibd_state_t::id_tx_list.dl_pending_sends))
638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
639     ibd_state_t::id_tx_list.dl_cnt))
640 
641 /*
642  * id_rx_list.dl_mutex
643  */
644 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
645     ibd_state_t::id_rx_list.dl_bufs_outstanding))
646 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
647     ibd_state_t::id_rx_list.dl_cnt))
648 
649 
650 /*
651  * Items protected by atomic updates
652  */
653 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
654     ibd_state_s::id_brd_rcv
655     ibd_state_s::id_brd_xmt
656     ibd_state_s::id_multi_rcv
657     ibd_state_s::id_multi_xmt
658     ibd_state_s::id_num_intrs
659     ibd_state_s::id_rcv_bytes
660     ibd_state_s::id_rcv_pkt
661     ibd_state_s::id_rx_post_queue_index
662     ibd_state_s::id_tx_short
663     ibd_state_s::id_xmt_bytes
664     ibd_state_s::id_xmt_pkt
665     ibd_state_s::rc_rcv_trans_byte
666     ibd_state_s::rc_rcv_trans_pkt
667     ibd_state_s::rc_rcv_copy_byte
668     ibd_state_s::rc_rcv_copy_pkt
669     ibd_state_s::rc_xmt_bytes
670     ibd_state_s::rc_xmt_small_pkt
671     ibd_state_s::rc_xmt_fragmented_pkt
672     ibd_state_s::rc_xmt_map_fail_pkt
673     ibd_state_s::rc_xmt_map_succ_pkt))
674 
675 /*
676  * Non-mutex protection schemes for data elements. Almost all of
677  * these are non-shared items.
678  */
679 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
680     callb_cpr
681     ib_gid_s
682     ib_header_info
683     ibd_acache_rq
684     ibd_acache_s::ac_mce
685     ibd_acache_s::ac_chan
686     ibd_mcache::mc_fullreap
687     ibd_mcache::mc_jstate
688     ibd_mcache::mc_req
689     ibd_rwqe_s
690     ibd_swqe_s
691     ibd_wqe_s
692     ibt_wr_ds_s::ds_va
693     ibt_wr_lso_s
694     ipoib_mac::ipoib_qpn
695     mac_capab_lso_s
696     msgb::b_next
697     msgb::b_cont
698     msgb::b_rptr
699     msgb::b_wptr
700     ibd_state_s::id_bgroup_created
701     ibd_state_s::id_mac_state
702     ibd_state_s::id_mtu
703     ibd_state_s::id_num_rwqe
704     ibd_state_s::id_num_swqe
705     ibd_state_s::id_qpnum
706     ibd_state_s::id_rcq_hdl
707     ibd_state_s::id_rx_buf_sz
708     ibd_state_s::id_rx_bufs
709     ibd_state_s::id_rx_mr_hdl
710     ibd_state_s::id_rx_wqes
711     ibd_state_s::id_rxwcs
712     ibd_state_s::id_rxwcs_size
713     ibd_state_s::id_rx_nqueues
714     ibd_state_s::id_rx_queues
715     ibd_state_s::id_scope
716     ibd_state_s::id_scq_hdl
717     ibd_state_s::id_tx_buf_sz
718     ibd_state_s::id_tx_bufs
719     ibd_state_s::id_tx_mr_hdl
720     ibd_state_s::id_tx_rel_list.dl_cnt
721     ibd_state_s::id_tx_wqes
722     ibd_state_s::id_txwcs
723     ibd_state_s::id_txwcs_size
724     ibd_state_s::rc_listen_hdl
725     ibd_state_s::rc_listen_hdl_OFED_interop
726     ibd_state_s::rc_srq_size
727     ibd_state_s::rc_srq_rwqes
728     ibd_state_s::rc_srq_rx_bufs
729     ibd_state_s::rc_srq_rx_mr_hdl
730     ibd_state_s::rc_tx_largebuf_desc_base
731     ibd_state_s::rc_tx_mr_bufs
732     ibd_state_s::rc_tx_mr_hdl
733     ipha_s
734     icmph_s
735     ibt_path_info_s::pi_sid
736     ibd_rc_chan_s::ace
737     ibd_rc_chan_s::chan_hdl
738     ibd_rc_chan_s::state
739     ibd_rc_chan_s::chan_state
740     ibd_rc_chan_s::is_tx_chan
741     ibd_rc_chan_s::rcq_hdl
742     ibd_rc_chan_s::rcq_size
743     ibd_rc_chan_s::scq_hdl
744     ibd_rc_chan_s::scq_size
745     ibd_rc_chan_s::requester_gid
746     ibd_rc_chan_s::requester_pkey
747     ibd_rc_chan_s::rx_bufs
748     ibd_rc_chan_s::rx_mr_hdl
749     ibd_rc_chan_s::rx_rwqes
750     ibd_rc_chan_s::tx_wqes
751     ibd_rc_chan_s::tx_mr_bufs
752     ibd_rc_chan_s::tx_mr_hdl
753     ibd_rc_chan_s::tx_rel_list.dl_cnt
754     ibd_rc_chan_s::tx_trans_error_cnt
755     ibd_rc_tx_largebuf_s::lb_buf
756     ibd_rc_msg_hello_s
757     ibt_cm_return_args_s))
758 
759 /*
760  * ibd_rc_chan_s::next is protected by two mutexes:
761  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
762  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
763  */
764 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
765     ibd_rc_chan_s::next))
766 
767 /*
768  * ibd_state_s.rc_tx_large_bufs_lock
769  */
770 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
771     ibd_state_s::rc_tx_largebuf_free_head))
772 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
773     ibd_state_s::rc_tx_largebuf_nfree))
774 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
775     ibd_rc_tx_largebuf_s::lb_next))
776 
777 /*
778  * ibd_acache_s.tx_too_big_mutex
779  */
780 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
781     ibd_acache_s::tx_too_big_ongoing))
782 
783 /*
784  * tx_wqe_list.dl_mutex
785  */
786 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
787     ibd_rc_chan_s::tx_wqe_list.dl_head))
788 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
789     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
790 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
791     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
792 
793 /*
794  * ibd_state_s.rc_ace_recycle_lock
795  */
796 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
797     ibd_state_s::rc_ace_recycle))
798 
799 /*
800  * rc_srq_rwqe_list.dl_mutex
801  */
802 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
803     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
804 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
805     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
806 
807 /*
808  * Non-mutex protection schemes for data elements. They are counters
809  * for problem diagnosis. They do not need to be protected.
810  */
811 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
812     ibd_state_s::rc_rcv_alloc_fail
813     ibd_state_s::rc_rcq_invoke
814     ibd_state_s::rc_rcq_err
815     ibd_state_s::rc_ace_not_found
816     ibd_state_s::rc_xmt_drop_too_long_pkt
817     ibd_state_s::rc_xmt_icmp_too_long_pkt
818     ibd_state_s::rc_xmt_reenter_too_long_pkt
819     ibd_state_s::rc_swqe_short
820     ibd_state_s::rc_swqe_mac_update
821     ibd_state_s::rc_xmt_buf_short
822     ibd_state_s::rc_xmt_buf_mac_update
823     ibd_state_s::rc_scq_no_swqe
824     ibd_state_s::rc_scq_no_largebuf
825     ibd_state_s::rc_scq_invoke
826     ibd_state_s::rc_conn_succ
827     ibd_state_s::rc_conn_fail
828     ibd_state_s::rc_null_conn
829     ibd_state_s::rc_no_estab_conn
830     ibd_state_s::rc_act_close
831     ibd_state_s::rc_pas_close
832     ibd_state_s::rc_delay_ace_recycle
833     ibd_state_s::rc_act_close_simultaneous
834     ibd_state_s::rc_reset_cnt))
835 
836 #ifdef DEBUG
837 /*
838  * Non-mutex protection schemes for data elements. They are counters
839  * for problem diagnosis. They do not need to be protected.
840  */
841 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
842     ibd_state_s::rc_rwqe_short
843     ibd_rc_stat_s::rc_rcv_trans_byte
844     ibd_rc_stat_s::rc_rcv_trans_pkt
845     ibd_rc_stat_s::rc_rcv_copy_byte
846     ibd_rc_stat_s::rc_rcv_copy_pkt
847     ibd_rc_stat_s::rc_rcv_alloc_fail
848     ibd_rc_stat_s::rc_rcq_invoke
849     ibd_rc_stat_s::rc_rcq_err
850     ibd_rc_stat_s::rc_scq_invoke
851     ibd_rc_stat_s::rc_rwqe_short
852     ibd_rc_stat_s::rc_xmt_bytes
853     ibd_rc_stat_s::rc_xmt_small_pkt
854     ibd_rc_stat_s::rc_xmt_fragmented_pkt
855     ibd_rc_stat_s::rc_xmt_map_fail_pkt
856     ibd_rc_stat_s::rc_xmt_map_succ_pkt
857     ibd_rc_stat_s::rc_ace_not_found
858     ibd_rc_stat_s::rc_scq_no_swqe
859     ibd_rc_stat_s::rc_scq_no_largebuf
860     ibd_rc_stat_s::rc_swqe_short
861     ibd_rc_stat_s::rc_swqe_mac_update
862     ibd_rc_stat_s::rc_xmt_buf_short
863     ibd_rc_stat_s::rc_xmt_buf_mac_update
864     ibd_rc_stat_s::rc_conn_succ
865     ibd_rc_stat_s::rc_conn_fail
866     ibd_rc_stat_s::rc_null_conn
867     ibd_rc_stat_s::rc_no_estab_conn
868     ibd_rc_stat_s::rc_act_close
869     ibd_rc_stat_s::rc_pas_close
870     ibd_rc_stat_s::rc_delay_ace_recycle
871     ibd_rc_stat_s::rc_act_close_simultaneous
872     ibd_rc_stat_s::rc_reset_cnt))
873 #endif
874 
875 int
876 _init()
877 {
878 	int status;
879 
880 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
881 	    PAGESIZE), 0);
882 	if (status != 0) {
883 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
884 		return (status);
885 	}
886 
887 	mac_init_ops(&ibd_dev_ops, "ibd");
888 	status = mod_install(&ibd_modlinkage);
889 	if (status != 0) {
890 		DPRINT(10, "_init:failed in mod_install()");
891 		ddi_soft_state_fini(&ibd_list);
892 		mac_fini_ops(&ibd_dev_ops);
893 		return (status);
894 	}
895 
896 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
897 	mutex_enter(&ibd_gstate.ig_mutex);
898 	ibd_gstate.ig_ibt_hdl = NULL;
899 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
900 	ibd_gstate.ig_service_list = NULL;
901 	mutex_exit(&ibd_gstate.ig_mutex);
902 
903 #ifdef IBD_LOGGING
904 	ibd_log_init();
905 #endif
906 	return (0);
907 }
908 
909 int
910 _info(struct modinfo *modinfop)
911 {
912 	return (mod_info(&ibd_modlinkage, modinfop));
913 }
914 
915 int
916 _fini()
917 {
918 	int status;
919 
920 	status = mod_remove(&ibd_modlinkage);
921 	if (status != 0)
922 		return (status);
923 
924 	mac_fini_ops(&ibd_dev_ops);
925 	ddi_soft_state_fini(&ibd_list);
926 	mutex_destroy(&ibd_gstate.ig_mutex);
927 #ifdef IBD_LOGGING
928 	ibd_log_fini();
929 #endif
930 	return (0);
931 }
932 
933 /*
934  * Convert the GID part of the mac address from network byte order
935  * to host order.
936  */
937 static void
938 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
939 {
940 	ib_sn_prefix_t nbopref;
941 	ib_guid_t nboguid;
942 
943 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
944 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
945 	dgid->gid_prefix = b2h64(nbopref);
946 	dgid->gid_guid = b2h64(nboguid);
947 }
948 
949 /*
950  * Create the IPoIB address in network byte order from host order inputs.
951  */
952 static void
953 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
954     ib_guid_t guid)
955 {
956 	ib_sn_prefix_t nbopref;
957 	ib_guid_t nboguid;
958 
959 	mac->ipoib_qpn = htonl(qpn);
960 	nbopref = h2b64(prefix);
961 	nboguid = h2b64(guid);
962 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
963 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
964 }
965 
966 /*
967  * Send to the appropriate all-routers group when the IBA multicast group
968  * does not exist, based on whether the target group is v4 or v6.
969  */
970 static boolean_t
971 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
972     ipoib_mac_t *rmac)
973 {
974 	boolean_t retval = B_TRUE;
975 	uint32_t adjscope = state->id_scope << 16;
976 	uint32_t topword;
977 
978 	/*
979 	 * Copy the first 4 bytes in without assuming any alignment of
980 	 * input mac address; this will have IPoIB signature, flags and
981 	 * scope bits.
982 	 */
983 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
984 	topword = ntohl(topword);
985 
986 	/*
987 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
988 	 */
989 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
990 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
991 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
992 		    ((uint32_t)(state->id_pkey << 16))),
993 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
994 	else
995 		/*
996 		 * Does not have proper bits in the mgid address.
997 		 */
998 		retval = B_FALSE;
999 
1000 	return (retval);
1001 }
1002 
1003 /*
1004  * Membership states for different mcg's are tracked by two lists:
1005  * the "non" list is used for promiscuous mode, when all mcg traffic
1006  * needs to be inspected. This type of membership is never used for
1007  * transmission, so there can not be an AH in the active list
1008  * corresponding to a member in this list. This list does not need
1009  * any protection, since all operations are performed by the async
1010  * thread.
1011  *
1012  * "Full" and "SendOnly" membership is tracked using a single list,
1013  * the "full" list. This is because this single list can then be
1014  * searched during transmit to a multicast group (if an AH for the
1015  * mcg is not found in the active list), since at least one type
1016  * of membership must be present before initiating the transmit.
1017  * This list is also emptied during driver detach, since sendonly
1018  * membership acquired during transmit is dropped at detach time
1019  * along with ipv4 broadcast full membership. Insert/deletes to
1020  * this list are done only by the async thread, but it is also
1021  * searched in program context (see multicast disable case), thus
1022  * the id_mc_mutex protects the list. The driver detach path also
1023  * deconstructs the "full" list, but it ensures that the async
1024  * thread will not be accessing the list (by blocking out mcg
1025  * trap handling and making sure no more Tx reaping will happen).
1026  *
1027  * Currently, an IBA attach is done in the SendOnly case too,
1028  * although this is not required.
1029  */
1030 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1031 	list_insert_head(&state->id_mc_full, mce)
1032 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1033 	list_insert_head(&state->id_mc_non, mce)
1034 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1035 	ibd_mcache_find(mgid, &state->id_mc_full)
1036 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1037 	ibd_mcache_find(mgid, &state->id_mc_non)
1038 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1039 	list_remove(&state->id_mc_full, mce)
1040 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1041 	list_remove(&state->id_mc_non, mce)
1042 
1043 static void *
1044 list_get_head(list_t *list)
1045 {
1046 	list_node_t *lhead = list_head(list);
1047 
1048 	if (lhead != NULL)
1049 		list_remove(list, lhead);
1050 	return (lhead);
1051 }
1052 
1053 /*
1054  * This is always guaranteed to be able to queue the work.
1055  */
1056 void
1057 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1058 {
1059 	/* Initialize request */
1060 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1061 	ptr->rq_op = op;
1062 
1063 	/*
1064 	 * Queue provided slot onto request pool.
1065 	 */
1066 	mutex_enter(&state->id_acache_req_lock);
1067 	list_insert_tail(&state->id_req_list, ptr);
1068 
1069 	/* Go, fetch, async thread */
1070 	cv_signal(&state->id_acache_req_cv);
1071 	mutex_exit(&state->id_acache_req_lock);
1072 }
1073 
1074 /*
1075  * Main body of the per interface async thread.
1076  */
1077 static void
1078 ibd_async_work(ibd_state_t *state)
1079 {
1080 	ibd_req_t *ptr;
1081 	callb_cpr_t cprinfo;
1082 
1083 	mutex_enter(&state->id_acache_req_lock);
1084 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1085 	    callb_generic_cpr, "ibd_async_work");
1086 
1087 	for (;;) {
1088 		ptr = list_get_head(&state->id_req_list);
1089 		if (ptr != NULL) {
1090 			mutex_exit(&state->id_acache_req_lock);
1091 
1092 			/*
1093 			 * Once we have done the operation, there is no
1094 			 * guarantee the request slot is going to be valid,
1095 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1096 			 * TRAP).
1097 			 *
1098 			 * Perform the request.
1099 			 */
1100 			switch (ptr->rq_op) {
1101 				case IBD_ASYNC_GETAH:
1102 					ibd_async_acache(state, &ptr->rq_mac);
1103 					break;
1104 				case IBD_ASYNC_JOIN:
1105 				case IBD_ASYNC_LEAVE:
1106 					ibd_async_multicast(state,
1107 					    ptr->rq_gid, ptr->rq_op);
1108 					break;
1109 				case IBD_ASYNC_PROMON:
1110 					ibd_async_setprom(state);
1111 					break;
1112 				case IBD_ASYNC_PROMOFF:
1113 					ibd_async_unsetprom(state);
1114 					break;
1115 				case IBD_ASYNC_REAP:
1116 					ibd_async_reap_group(state,
1117 					    ptr->rq_ptr, ptr->rq_gid,
1118 					    IB_MC_JSTATE_FULL);
1119 					/*
1120 					 * the req buf is contained in the
1121 					 * mce structure, so we do not need
1122 					 * to free it here.
1123 					 */
1124 					ptr = NULL;
1125 					break;
1126 				case IBD_ASYNC_TRAP:
1127 					ibd_async_trap(state, ptr);
1128 					break;
1129 				case IBD_ASYNC_SCHED:
1130 					ibd_async_txsched(state);
1131 					break;
1132 				case IBD_ASYNC_LINK:
1133 					ibd_async_link(state, ptr);
1134 					break;
1135 				case IBD_ASYNC_EXIT:
1136 					mutex_enter(&state->id_acache_req_lock);
1137 #ifndef __lock_lint
1138 					CALLB_CPR_EXIT(&cprinfo);
1139 #else
1140 					mutex_exit(&state->id_acache_req_lock);
1141 #endif
1142 					return;
1143 				case IBD_ASYNC_RC_TOO_BIG:
1144 					ibd_async_rc_process_too_big(state,
1145 					    ptr);
1146 					break;
1147 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1148 					ibd_async_rc_close_act_chan(state, ptr);
1149 					break;
1150 				case IBD_ASYNC_RC_RECYCLE_ACE:
1151 					ibd_async_rc_recycle_ace(state, ptr);
1152 					break;
1153 			}
1154 			if (ptr != NULL)
1155 				kmem_cache_free(state->id_req_kmc, ptr);
1156 
1157 			mutex_enter(&state->id_acache_req_lock);
1158 		} else {
1159 #ifndef __lock_lint
1160 			/*
1161 			 * Nothing to do: wait till new request arrives.
1162 			 */
1163 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1164 			cv_wait(&state->id_acache_req_cv,
1165 			    &state->id_acache_req_lock);
1166 			CALLB_CPR_SAFE_END(&cprinfo,
1167 			    &state->id_acache_req_lock);
1168 #endif
1169 		}
1170 	}
1171 
1172 	/*NOTREACHED*/
1173 	_NOTE(NOT_REACHED)
1174 }
1175 
1176 /*
1177  * Return whether it is safe to queue requests to the async daemon; primarily
1178  * for subnet trap and async event handling. Disallow requests before the
1179  * daemon is created, and when interface deinitialization starts.
1180  */
1181 static boolean_t
1182 ibd_async_safe(ibd_state_t *state)
1183 {
1184 	mutex_enter(&state->id_trap_lock);
1185 	if (state->id_trap_stop) {
1186 		mutex_exit(&state->id_trap_lock);
1187 		return (B_FALSE);
1188 	}
1189 	state->id_trap_inprog++;
1190 	mutex_exit(&state->id_trap_lock);
1191 	return (B_TRUE);
1192 }
1193 
1194 /*
1195  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1196  * trap or event handling to complete, so that it can kill the async thread
1197  * and deconstruct the mcg/ace list.
1198  */
1199 static void
1200 ibd_async_done(ibd_state_t *state)
1201 {
1202 	mutex_enter(&state->id_trap_lock);
1203 	if (--state->id_trap_inprog == 0)
1204 		cv_signal(&state->id_trap_cv);
1205 	mutex_exit(&state->id_trap_lock);
1206 }
1207 
1208 /*
1209  * Hash functions:
1210  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1211  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1212  * These operate on mac addresses input into ibd_send, but there is no
1213  * guarantee on the alignment of the ipoib_mac_t structure.
1214  */
1215 /*ARGSUSED*/
1216 static uint_t
1217 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1218 {
1219 	ulong_t ptraddr = (ulong_t)key;
1220 	uint_t hval;
1221 
1222 	/*
1223 	 * If the input address is 4 byte aligned, we can just dereference
1224 	 * it. This is most common, since IP will send in a 4 byte aligned
1225 	 * IP header, which implies the 24 byte IPoIB pseudo header will be
1226 	 * 4 byte aligned too.
1227 	 */
1228 	if ((ptraddr & 3) == 0)
1229 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1230 
1231 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1232 	return (hval);
1233 }
1234 
1235 static int
1236 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1237 {
1238 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1239 		return (0);
1240 	else
1241 		return (1);
1242 }
1243 
1244 /*
1245  * Initialize all the per interface caches and lists; AH cache,
1246  * MCG list etc.
1247  */
1248 static int
1249 ibd_acache_init(ibd_state_t *state)
1250 {
1251 	ibd_ace_t *ce;
1252 	int i;
1253 
1254 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1255 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1256 
1257 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1258 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1259 	mutex_enter(&state->id_ac_mutex);
1260 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1261 	    offsetof(ibd_ace_t, ac_list));
1262 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1263 	    offsetof(ibd_ace_t, ac_list));
1264 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1265 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1266 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1267 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1268 	    offsetof(ibd_mce_t, mc_list));
1269 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1270 	    offsetof(ibd_mce_t, mc_list));
1271 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1272 	    offsetof(ibd_req_t, rq_list));
1273 	state->id_ac_hot_ace = NULL;
1274 
1275 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1276 	    IBD_NUM_AH, KM_SLEEP);
1277 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1278 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1279 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1280 			mutex_exit(&state->id_ac_mutex);
1281 			ibd_acache_fini(state);
1282 			return (DDI_FAILURE);
1283 		} else {
1284 			CLEAR_REFCYCLE(ce);
1285 			ce->ac_mce = NULL;
1286 			mutex_init(&ce->tx_too_big_mutex, NULL,
1287 			    MUTEX_DRIVER, NULL);
1288 			IBD_ACACHE_INSERT_FREE(state, ce);
1289 		}
1290 	}
1291 	mutex_exit(&state->id_ac_mutex);
1292 	return (DDI_SUCCESS);
1293 }
1294 
1295 static void
1296 ibd_acache_fini(ibd_state_t *state)
1297 {
1298 	ibd_ace_t *ptr;
1299 
1300 	mutex_enter(&state->id_ac_mutex);
1301 
1302 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1303 		ASSERT(GET_REF(ptr) == 0);
1304 		mutex_destroy(&ptr->tx_too_big_mutex);
1305 		(void) ibt_free_ud_dest(ptr->ac_dest);
1306 	}
1307 
1308 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1309 		ASSERT(GET_REF(ptr) == 0);
1310 		mutex_destroy(&ptr->tx_too_big_mutex);
1311 		(void) ibt_free_ud_dest(ptr->ac_dest);
1312 	}
1313 
1314 	list_destroy(&state->id_ah_free);
1315 	list_destroy(&state->id_ah_active);
1316 	list_destroy(&state->id_mc_full);
1317 	list_destroy(&state->id_mc_non);
1318 	list_destroy(&state->id_req_list);
1319 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1320 	mutex_exit(&state->id_ac_mutex);
1321 	mutex_destroy(&state->id_ac_mutex);
1322 	mutex_destroy(&state->id_mc_mutex);
1323 	mutex_destroy(&state->id_acache_req_lock);
1324 	cv_destroy(&state->id_acache_req_cv);
1325 }
1326 
1327 /*
1328  * Search AH active hash list for a cached path to input destination.
1329  * If we are "just looking", hold == F. When we are in the Tx path,
1330  * we set hold == T to grab a reference on the AH so that it can not
1331  * be recycled to a new destination while the Tx request is posted.
1332  */
1333 ibd_ace_t *
1334 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1335 {
1336 	ibd_ace_t *ptr;
1337 
1338 	ASSERT(mutex_owned(&state->id_ac_mutex));
1339 
1340 	/*
1341 	 * Do hash search.
1342 	 */
1343 	if (mod_hash_find(state->id_ah_active_hash,
1344 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1345 		if (hold)
1346 			INC_REF(ptr, num);
1347 		return (ptr);
1348 	}
1349 	return (NULL);
1350 }
1351 
1352 /*
1353  * This is called by the tx side; if an initialized AH is found in
1354  * the active list, it is locked down and can be used; if no entry
1355  * is found, an async request is queued to do path resolution.
1356  */
1357 static ibd_ace_t *
1358 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1359 {
1360 	ibd_ace_t *ptr;
1361 	ibd_req_t *req;
1362 
1363 	/*
1364 	 * Only attempt to print when we can; in the mdt pattr case, the
1365 	 * address is not aligned properly.
1366 	 */
1367 	if (((ulong_t)mac & 3) == 0) {
1368 		DPRINT(4,
1369 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1370 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1371 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1372 		    htonl(mac->ipoib_gidsuff[1]));
1373 	}
1374 
1375 	mutex_enter(&state->id_ac_mutex);
1376 
1377 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1378 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1379 		INC_REF(ptr, numwqe);
1380 		mutex_exit(&state->id_ac_mutex);
1381 		return (ptr);
1382 	}
1383 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1384 		state->id_ac_hot_ace = ptr;
1385 		mutex_exit(&state->id_ac_mutex);
1386 		return (ptr);
1387 	}
1388 
1389 	/*
1390 	 * Implementation of a single outstanding async request; if
1391 	 * the operation is not started yet, queue a request and move
1392 	 * to ongoing state. Remember in id_ah_addr for which address
1393 	 * we are queueing the request, in case we need to flag an error;
1394 	 * Any further requests, for the same or different address, until
1395 	 * the operation completes, are sent back to GLDv3 to be retried.
1396 	 * The async thread will update id_ah_op with an error indication
1397 	 * or will set it to indicate the next look up can start; either
1398 	 * way, it will mac_tx_update() so that all blocked requests come
1399 	 * back here.
1400 	 */
1401 	*err = EAGAIN;
1402 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1403 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1404 		if (req != NULL) {
1405 			/*
1406 			 * We did not even find the entry; queue a request
1407 			 * for it.
1408 			 */
1409 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1410 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1411 			state->id_ah_op = IBD_OP_ONGOING;
1412 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1413 		}
1414 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1415 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1416 		/*
1417 		 * Check the status of the pathrecord lookup request
1418 		 * we had queued before.
1419 		 */
1420 		if (state->id_ah_op == IBD_OP_ERRORED) {
1421 			*err = EFAULT;
1422 			state->id_ah_error++;
1423 		} else {
1424 			/*
1425 			 * IBD_OP_ROUTERED case: We need to send to the
1426 			 * all-router MCG. If we can find the AH for
1427 			 * the mcg, the Tx will be attempted. If we
1428 			 * do not find the AH, we return NORESOURCES
1429 			 * to retry.
1430 			 */
1431 			ipoib_mac_t routermac;
1432 
1433 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1434 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1435 			    numwqe);
1436 		}
1437 		state->id_ah_op = IBD_OP_NOTSTARTED;
1438 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1439 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1440 		/*
1441 		 * This case can happen when we get a higher band
1442 		 * packet. The easiest way is to reset the state machine
1443 		 * to accommodate the higher priority packet.
1444 		 */
1445 		state->id_ah_op = IBD_OP_NOTSTARTED;
1446 	}
1447 	mutex_exit(&state->id_ac_mutex);
1448 
1449 	return (ptr);
1450 }
1451 
1452 /*
1453  * Grab a not-currently-in-use AH/PathRecord from the active
1454  * list to recycle to a new destination. Only the async thread
1455  * executes this code.
1456  */
1457 static ibd_ace_t *
1458 ibd_acache_get_unref(ibd_state_t *state)
1459 {
1460 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1461 	boolean_t try_rc_chan_recycle = B_FALSE;
1462 
1463 	ASSERT(mutex_owned(&state->id_ac_mutex));
1464 
1465 	/*
1466 	 * Do plain linear search.
1467 	 */
1468 	while (ptr != NULL) {
1469 		/*
1470 		 * Note that it is possible that the "cycle" bit
1471 		 * is set on the AH w/o any reference count. The
1472 		 * mcg must have been deleted, and the tx cleanup
1473 		 * just decremented the reference count to 0, but
1474 		 * hasn't gotten around to grabbing the id_ac_mutex
1475 		 * to move the AH into the free list.
1476 		 */
1477 		if (GET_REF(ptr) == 0) {
1478 			if (ptr->ac_chan != NULL) {
1479 				ASSERT(state->id_enable_rc == B_TRUE);
1480 				if (!try_rc_chan_recycle) {
1481 					try_rc_chan_recycle = B_TRUE;
1482 					ibd_rc_signal_ace_recycle(state, ptr);
1483 				}
1484 			} else {
1485 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1486 				break;
1487 			}
1488 		}
1489 		ptr = list_prev(&state->id_ah_active, ptr);
1490 	}
1491 	return (ptr);
1492 }
1493 
1494 /*
1495  * Invoked to clean up an AH from the active list in case of multicast
1496  * disable, to handle sendonly memberships during mcg traps, and for
1497  * port up processing of multicast and unicast AHs.
1498  * Normally, the AH is taken off the active list, and put into
1499  * the free list to be recycled for a new destination. In case
1500  * Tx requests on the AH have not completed yet, the AH is marked
1501  * for reaping (which will put the AH on the free list) once the Tx's
1502  * complete; in this case, depending on the "force" input, we take
1503  * out the AH from the active list right now, or leave it also for
1504  * the reap operation. Returns TRUE if the AH is taken off the active
1505  * list (and either put into the free list right now, or arranged for
1506  * later), FALSE otherwise.
1507  */
1508 boolean_t
1509 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1510 {
1511 	ibd_ace_t *acactive;
1512 	boolean_t ret = B_TRUE;
1513 
1514 	ASSERT(mutex_owned(&state->id_ac_mutex));
1515 
1516 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1517 
1518 		/*
1519 		 * Note that the AH might already have the cycle bit set
1520 		 * on it; this might happen if sequences of multicast
1521 		 * enables and disables are coming so fast, that posted
1522 		 * Tx's to the mcg have not completed yet, and the cycle
1523 		 * bit is set successively by each multicast disable.
1524 		 */
1525 		if (SET_CYCLE_IF_REF(acactive)) {
1526 			if (!force) {
1527 				/*
1528 				 * The ace is kept on the active list, further
1529 				 * Tx's can still grab a reference on it; the
1530 				 * ace is reaped when all pending Tx's
1531 				 * referencing the AH complete.
1532 				 */
1533 				ret = B_FALSE;
1534 			} else {
1535 				/*
1536 				 * In the mcg trap case, we always pull the
1537 				 * AH from the active list. And also the port
1538 				 * up multi/unicast case.
1539 				 */
1540 				ASSERT(acactive->ac_chan == NULL);
1541 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1542 				acactive->ac_mce = NULL;
1543 			}
1544 		} else {
1545 			/*
1546 			 * The ref count is 0, so reclaim the ace
1547 			 * immediately after pulling it out of
1548 			 * the active list.
1549 			 */
1550 			ASSERT(acactive->ac_chan == NULL);
1551 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1552 			acactive->ac_mce = NULL;
1553 			IBD_ACACHE_INSERT_FREE(state, acactive);
1554 		}
1555 
1556 	}
1557 	return (ret);
1558 }
1559 
1560 /*
1561  * Helper function for async path record lookup. If we are trying to
1562  * Tx to a MCG, check our membership, possibly trying to join the
1563  * group if required. If that fails, try to send the packet to the
1564  * all router group (indicated by the redirect output), pointing
1565  * the input mac address to the router mcg address.
1566  */
1567 static ibd_mce_t *
1568 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1569 {
1570 	ib_gid_t mgid;
1571 	ibd_mce_t *mce;
1572 	ipoib_mac_t routermac;
1573 
1574 	*redirect = B_FALSE;
1575 	ibd_n2h_gid(mac, &mgid);
1576 
1577 	/*
1578 	 * Check the FullMember+SendOnlyNonMember list.
1579 	 * Since we are the only one who manipulates the
1580 	 * id_mc_full list, no locks are needed.
1581 	 */
1582 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1583 	if (mce != NULL) {
1584 		DPRINT(4, "ibd_async_mcache : already joined to group");
1585 		return (mce);
1586 	}
1587 
1588 	/*
1589 	 * Not found; try to join(SendOnlyNonMember) and attach.
1590 	 */
1591 	DPRINT(4, "ibd_async_mcache : not joined to group");
1592 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1593 	    NULL) {
1594 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1595 		return (mce);
1596 	}
1597 
1598 	/*
1599 	 * MCGroup not present; try to join the all-router group. If
1600 	 * any of the following steps succeed, we will be redirecting
1601 	 * to the all router group.
1602 	 */
1603 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1604 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1605 		return (NULL);
1606 	*redirect = B_TRUE;
1607 	ibd_n2h_gid(&routermac, &mgid);
1608 	bcopy(&routermac, mac, IPOIB_ADDRL);
1609 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1610 	    mgid.gid_prefix, mgid.gid_guid);
1611 
1612 	/*
1613 	 * Are we already joined to the router group?
1614 	 */
1615 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1616 		DPRINT(4, "ibd_async_mcache : using already joined router"
1617 		    "group\n");
1618 		return (mce);
1619 	}
1620 
1621 	/*
1622 	 * Can we join(SendOnlyNonMember) the router group?
1623 	 */
1624 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1625 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1626 	    NULL) {
1627 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1628 		return (mce);
1629 	}
1630 
1631 	return (NULL);
1632 }
1633 
1634 /*
1635  * Async path record lookup code.
1636  */
1637 static void
1638 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1639 {
1640 	ibd_ace_t *ce;
1641 	ibd_mce_t *mce = NULL;
1642 	ibt_path_attr_t path_attr;
1643 	ibt_path_info_t path_info;
1644 	ib_gid_t destgid;
1645 	char ret = IBD_OP_NOTSTARTED;
1646 
1647 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1648 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1649 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1650 	    htonl(mac->ipoib_gidsuff[1]));
1651 
1652 	/*
1653 	 * Check whether we are trying to transmit to a MCG.
1654 	 * In that case, we need to make sure we are a member of
1655 	 * the MCG.
1656 	 */
1657 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1658 		boolean_t redirected;
1659 
1660 		/*
1661 		 * If we can not find or join the group or even
1662 		 * redirect, error out.
1663 		 */
1664 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1665 		    NULL) {
1666 			state->id_ah_op = IBD_OP_ERRORED;
1667 			return;
1668 		}
1669 
1670 		/*
1671 		 * If we got redirected, we need to determine whether
1672 		 * the AH for the new mcg is already in the cache, in which
1673 		 * case we need not pull it in; otherwise we proceed to get
1674 		 * the path for the new mcg. There is no guarantee that
1675 		 * if the AH is currently in the cache, it will still be
1676 		 * there when we look in ibd_acache_lookup(), but that's
1677 		 * okay, we will come back here.
1678 		 */
1679 		if (redirected) {
1680 			ret = IBD_OP_ROUTERED;
1681 			DPRINT(4, "ibd_async_acache :  redirected to "
1682 			    "%08X:%08X:%08X:%08X:%08X",
1683 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1684 			    htonl(mac->ipoib_gidpref[1]),
1685 			    htonl(mac->ipoib_gidsuff[0]),
1686 			    htonl(mac->ipoib_gidsuff[1]));
1687 
1688 			mutex_enter(&state->id_ac_mutex);
1689 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1690 				state->id_ah_op = IBD_OP_ROUTERED;
1691 				mutex_exit(&state->id_ac_mutex);
1692 				DPRINT(4, "ibd_async_acache : router AH found");
1693 				return;
1694 			}
1695 			mutex_exit(&state->id_ac_mutex);
1696 		}
1697 	}
1698 
1699 	/*
1700 	 * Get an AH from the free list.
1701 	 */
1702 	mutex_enter(&state->id_ac_mutex);
1703 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1704 		/*
1705 		 * No free ones; try to grab an unreferenced active
1706 		 * one. Maybe we need to make the active list LRU,
1707 		 * but that will create more work for Tx callbacks.
1708 		 * Is there a way of not having to pull out the
1709 		 * entry from the active list, but just indicate it
1710 		 * is being recycled? Yes, but that creates one more
1711 		 * check in the fast lookup path.
1712 		 */
1713 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1714 			/*
1715 			 * Pretty serious shortage now.
1716 			 */
1717 			state->id_ah_op = IBD_OP_NOTSTARTED;
1718 			mutex_exit(&state->id_ac_mutex);
1719 			DPRINT(10, "ibd_async_acache : failed to find AH "
1720 			    "slot\n");
1721 			return;
1722 		}
1723 		/*
1724 		 * We could check whether ac_mce points to a SendOnly
1725 		 * member and drop that membership now. Or do it lazily
1726 		 * at detach time.
1727 		 */
1728 		ce->ac_mce = NULL;
1729 	}
1730 	mutex_exit(&state->id_ac_mutex);
1731 	ASSERT(ce->ac_mce == NULL);
1732 
1733 	/*
1734 	 * Update the entry.
1735 	 */
1736 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1737 
1738 	bzero(&path_info, sizeof (path_info));
1739 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1740 	path_attr.pa_sgid = state->id_sgid;
1741 	path_attr.pa_num_dgids = 1;
1742 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1743 	path_attr.pa_dgids = &destgid;
1744 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1745 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1746 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1747 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1748 		goto error;
1749 	}
1750 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1751 	    ntohl(ce->ac_mac.ipoib_qpn),
1752 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1753 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1754 		goto error;
1755 	}
1756 
1757 	/*
1758 	 * mce is set whenever an AH is being associated with a
1759 	 * MCG; this will come in handy when we leave the MCG. The
1760 	 * lock protects Tx fastpath from scanning the active list.
1761 	 */
1762 	if (mce != NULL)
1763 		ce->ac_mce = mce;
1764 
1765 	/*
1766 	 * Initiate an RC mode connection for a unicast address.
1767 	 */
1768 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1769 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1770 		ASSERT(ce->ac_chan == NULL);
1771 		DPRINT(10, "ibd_async_acache: call "
1772 		    "ibd_rc_try_connect(ace=%p)", ce);
1773 		ibd_rc_try_connect(state, ce, &path_info);
1774 		if (ce->ac_chan == NULL) {
1775 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1776 			    " channel");
1777 			state->rc_conn_fail++;
1778 			goto error;
1779 		}
1780 	}
1781 
1782 	mutex_enter(&state->id_ac_mutex);
1783 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1784 	state->id_ah_op = ret;
1785 	mutex_exit(&state->id_ac_mutex);
1786 	return;
1787 error:
1788 	/*
1789 	 * We might want to drop SendOnly membership here if we
1790 	 * joined above. The lock protects Tx callbacks inserting
1791 	 * into the free list.
1792 	 */
1793 	mutex_enter(&state->id_ac_mutex);
1794 	state->id_ah_op = IBD_OP_ERRORED;
1795 	IBD_ACACHE_INSERT_FREE(state, ce);
1796 	mutex_exit(&state->id_ac_mutex);
1797 }
1798 
1799 /*
1800  * While restoring the port's presence on the subnet after a port up, it is
1801  * possible that the port goes down again.
1802  */
1803 static void
1804 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1805 {
1806 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1807 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1808 	    LINK_STATE_UP;
1809 	ibd_mce_t *mce, *pmce;
1810 	ibd_ace_t *ace, *pace;
1811 
1812 	DPRINT(10, "ibd_async_link(): %d", opcode);
1813 
1814 	/*
1815 	 * On a link up, revalidate the link speed/width. No point doing
1816 	 * this on a link down, since we would be unable to do SA operations
1817 	 * and would just default to the lowest speed. Also note that we
1818 	 * update our notion of speed before calling mac_link_update(), which
1819 	 * does the necessary higher-level notifications for speed changes.
1820 	 */
1821 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1822 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1823 		state->id_link_speed = ibd_get_portspeed(state);
1824 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1825 	}
1826 
1827 	/*
1828 	 * Do all the work required to establish our presence on
1829 	 * the subnet.
1830 	 */
1831 	if (opcode == IBD_LINK_UP_ABSENT) {
1832 		/*
1833 		 * If in promiscuous mode ...
1834 		 */
1835 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1836 			/*
1837 			 * Drop all nonmembership.
1838 			 */
1839 			ibd_async_unsetprom(state);
1840 
1841 			/*
1842 			 * Then, try to regain nonmembership to all mcg's.
1843 			 */
1844 			ibd_async_setprom(state);
1845 
1846 		}
1847 
1848 		/*
1849 		 * Drop all sendonly membership (which also gets rid of the
1850 		 * AHs); try to reacquire all full membership.
1851 		 */
1852 		mce = list_head(&state->id_mc_full);
1853 		while ((pmce = mce) != NULL) {
1854 			mce = list_next(&state->id_mc_full, mce);
1855 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1856 				ibd_leave_group(state,
1857 				    pmce->mc_info.mc_adds_vect.av_dgid,
1858 				    IB_MC_JSTATE_SEND_ONLY_NON);
1859 			else
1860 				ibd_reacquire_group(state, pmce);
1861 		}
1862 
1863 		/*
1864 		 * Recycle all active AHs to free list (and if there are
1865 		 * pending posts, make sure they will go into the free list
1866 		 * once the Tx's complete). Grab the lock to prevent
1867 		 * concurrent Tx's as well as Tx cleanups.
1868 		 */
1869 		mutex_enter(&state->id_ac_mutex);
1870 		ace = list_head(&state->id_ah_active);
1871 		while ((pace = ace) != NULL) {
1872 			boolean_t cycled;
1873 			boolean_t cycled = B_TRUE;
1874 			ace = list_next(&state->id_ah_active, ace);
1875 			mce = pace->ac_mce;
1876 			if (pace->ac_chan != NULL) {
1877 				ASSERT(mce == NULL);
1878 				ASSERT(state->id_enable_rc == B_TRUE);
1879 				if (pace->ac_chan->chan_state ==
1880 				    IBD_RC_STATE_ACT_ESTAB) {
1881 					INC_REF(pace, 1);
1882 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
1883 					pace->ac_chan->chan_state =
1884 					    IBD_RC_STATE_ACT_CLOSING;
1885 					ibd_rc_signal_act_close(state, pace);
1886 				} else {
1887 					state->rc_act_close_simultaneous++;
1888 					DPRINT(40, "ibd_async_link: other "
1889 					    "thread is closing it, ace=%p, "
1890 					    "ac_chan=%p, chan_state=%d",
1891 					    pace, pace->ac_chan,
1892 					    pace->ac_chan->chan_state);
1893 				}
1894 			} else {
1895 				cycled = ibd_acache_recycle(state,
1896 				    &pace->ac_mac, B_TRUE);
1897 			}
1898 			/*
1899 			 * If this is for an mcg, it must be for a fullmember,
1900 			 * since send-only members were dropped above when the
1901 			 * mce list was processed; the RC branch skips recycle.
1902 			 */
1903 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1904 			    IB_MC_JSTATE_FULL)));
1905 
1906 			/*
1907 			 * Check whether the fullmember mce needs to be torn
1908 			 * down, i.e. whether the DLPI disable has already been
1909 			 * done. If so, do some of the work of tx_cleanup,
1910 			 * namely issuing the leave (which will fail), the
1911 			 * detach and the mce free. tx_cleanup will put the AH
1912 			 * into the free list. We duplicate some of this
1913 			 * tx_cleanup work because we want to delete the AH
1914 			 * right now instead of waiting for tx_cleanup, to
1915 			 * force subsequent Tx's to reacquire an AH.
1916 			 */
1917 			if ((mce != NULL) && (mce->mc_fullreap))
1918 				ibd_async_reap_group(state, mce,
1919 				    mce->mc_info.mc_adds_vect.av_dgid,
1920 				    mce->mc_jstate);
1921 		}
1922 		mutex_exit(&state->id_ac_mutex);
1923 	}
1924 
1925 	/*
1926 	 * The mac handle is guaranteed to exist, since the driver does
1927 	 * ibt_close_hca() (which stops further events from being delivered)
1928 	 * before mac_unregister(). At this point, mac_register() is
1929 	 * guaranteed to have already been done.
1930 	 */
1931 	mutex_enter(&state->id_link_mutex);
1932 	state->id_link_state = lstate;
1933 	mac_link_update(state->id_mh, lstate);
1934 	mutex_exit(&state->id_link_mutex);
1935 
1936 	ibd_async_done(state);
1937 }
1938 
1939 /*
1940  * Check the pkey table to see if we can find the pkey we're looking for.
1941  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1942  * failure.
1943  */
1944 static int
1945 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1946     uint16_t *pkix)
1947 {
1948 	uint16_t ndx;
1949 
1950 	ASSERT(pkix != NULL);
1951 
1952 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1953 		if (pkey_tbl[ndx] == pkey) {
1954 			*pkix = ndx;
1955 			return (0);
1956 		}
1957 	}
1958 	return (-1);
1959 }
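
/*
 * Illustrative note (editor's addition): for example, with a port pkey
 * table of { 0x7fff, 0xffff } and a search value of 0xffff,
 * ibd_locate_pkey() sets *pkix to 1 and returns 0; a pkey that is not in
 * the table yields -1 with *pkix left untouched.
 */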
1960 
1961 /*
1962  * When we are notified of a link up, we need to do a few things, based
1963  * on whether the port's current p_init_type_reply claims that a reinit
1964  * has been done. The reinit steps are:
1965  * 1. If, in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1966  *    that the old Pkey and GID0 are still correct.
1967  * 2. Register for mcg traps (already done by ibmf).
1968  * 3. If PreservePresenceReply indicates the SM has restored the port's
1969  *    presence in the subnet, nothing more to do; else do 4-7 (async daemon).
1970  * 4. Give up all sendonly memberships.
1971  * 5. Acquire all full memberships.
1972  * 6. In promiscuous mode, acquire all non memberships.
1973  * 7. Recycle all AHs to the free list.
1974  */
1975 static void
1976 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1977 {
1978 	ibt_hca_portinfo_t *port_infop = NULL;
1979 	ibt_status_t ibt_status;
1980 	uint_t psize, port_infosz;
1981 	ibd_link_op_t opcode;
1982 	ibd_req_t *req;
1983 	link_state_t new_link_state = LINK_STATE_UP;
1984 	uint8_t itreply;
1985 	uint16_t pkix;
1986 	int ret;
1987 
1988 	/*
1989 	 * Let's not race with a plumb or an unplumb; if we detect a
1990 	 * pkey relocation event later on here, we may have to restart.
1991 	 */
1992 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1993 
1994 	mutex_enter(&state->id_link_mutex);
1995 
1996 	/*
1997 	 * If the init code in ibd_m_start hasn't yet set up the
1998 	 * pkey/gid, nothing to do; that code will set the link state.
1999 	 */
2000 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2001 		mutex_exit(&state->id_link_mutex);
2002 		goto link_mod_return;
2003 	}
2004 
2005 	/*
2006 	 * If this routine was called in response to a port down event,
2007 	 * we just need to see whether the state change should be reported.
2008 	 */
2009 	if (code == IBT_ERROR_PORT_DOWN) {
2010 		new_link_state = LINK_STATE_DOWN;
2011 		goto update_link_state;
2012 	}
2013 
2014 	/*
2015 	 * If it's not a port down event we've received, try to get the port
2016 	 * attributes first. If we fail here, the port is as good as down.
2017 	 * Otherwise, if the link went down by the time the handler gets
2018 	 * here, give up - we cannot even validate the pkey/gid, since they
2019 	 * are no longer valid, and this is as bad as a port down anyway.
2020 	 */
2021 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2022 	    &port_infop, &psize, &port_infosz);
2023 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2024 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2025 		new_link_state = LINK_STATE_DOWN;
2026 		goto update_link_state;
2027 	}
2028 
2029 	/*
2030 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2031 	 * PreserveContentReply are 0, we don't know anything about the
2032 	 * data loaded into the port attributes, so we need to verify
2033 	 * if gid0 and pkey are still valid.
2034 	 */
2035 	itreply = port_infop->p_init_type_reply;
2036 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2037 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2038 		/*
2039 		 * Check to see if the subnet part of GID0 has changed. If
2040 		 * not, check the simple case first to see if the pkey
2041 		 * index is the same as before; finally check to see if the
2042 		 * pkey has been relocated to a different index in the table.
2043 		 */
2044 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2045 		if (bcmp(port_infop->p_sgid_tbl,
2046 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2047 
2048 			new_link_state = LINK_STATE_DOWN;
2049 
2050 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2051 		    state->id_pkey) {
2052 
2053 			new_link_state = LINK_STATE_UP;
2054 
2055 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2056 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2057 
2058 			ibt_free_portinfo(port_infop, port_infosz);
2059 			mutex_exit(&state->id_link_mutex);
2060 
2061 			/*
2062 			 * Currently a restart is required if our pkey has moved
2063 			 * in the pkey table. If we get the ibt_recycle_ud() to
2064 			 * work as documented (expected), we may be able to
2065 			 * avoid a complete restart.  Note that we've already
2066 			 * marked both the start and stop 'in-progress' flags,
2067 			 * so it is ok to go ahead and do this restart.
2068 			 */
2069 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2070 			if ((ret = ibd_start(state)) != 0) {
2071 				DPRINT(10, "ibd_restart: cannot restart, "
2072 				    "ret=%d", ret);
2073 			}
2074 
2075 			goto link_mod_return;
2076 		} else {
2077 			new_link_state = LINK_STATE_DOWN;
2078 		}
2079 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2080 	}
2081 
2082 update_link_state:
2083 	if (port_infop) {
2084 		ibt_free_portinfo(port_infop, port_infosz);
2085 	}
2086 
2087 	/*
2088 	 * If the old state is the same as the new state, nothing to do
2089 	 */
2090 	if (state->id_link_state == new_link_state) {
2091 		mutex_exit(&state->id_link_mutex);
2092 		goto link_mod_return;
2093 	}
2094 
2095 	/*
2096 	 * Ok, so there was a link state change; see if it's safe to ask
2097 	 * the async thread to do the work
2098 	 */
2099 	if (!ibd_async_safe(state)) {
2100 		state->id_link_state = new_link_state;
2101 		mutex_exit(&state->id_link_mutex);
2102 		goto link_mod_return;
2103 	}
2104 
2105 	mutex_exit(&state->id_link_mutex);
2106 
2107 	/*
2108 	 * If we're reporting a link up, check InitTypeReply to see if
2109 	 * the SM has ensured that the port's presence in mcg, traps,
2110 	 * etc. is intact.
2111 	 */
2112 	if (new_link_state == LINK_STATE_DOWN) {
2113 		opcode = IBD_LINK_DOWN;
2114 	} else {
2115 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2116 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2117 			opcode = IBD_LINK_UP;
2118 		} else {
2119 			opcode = IBD_LINK_UP_ABSENT;
2120 		}
2121 	}
2122 
2123 	/*
2124 	 * Queue up a request for ibd_async_link() to handle this link
2125 	 * state change event
2126 	 */
2127 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2128 	req->rq_ptr = (void *)opcode;
2129 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2130 
2131 link_mod_return:
2132 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2133 }
2134 
2135 /*
2136  * For the port up/down events, IBTL guarantees there will not be concurrent
2137  * invocations of the handler. IBTL might coalesce link transition events,
2138  * and not invoke the handler for _each_ up/down transition, but it will
2139  * invoke the handler with the last known state.
2140  */
2141 static void
2142 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2143     ibt_async_code_t code, ibt_async_event_t *event)
2144 {
2145 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2146 
2147 	switch (code) {
2148 	case IBT_ERROR_CATASTROPHIC_CHAN:
2149 		ibd_print_warn(state, "catastrophic channel error");
2150 		break;
2151 	case IBT_ERROR_CQ:
2152 		ibd_print_warn(state, "completion queue error");
2153 		break;
2154 	case IBT_PORT_CHANGE_EVENT:
2155 		/*
2156 		 * Events will be delivered to all instances that have
2157 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2158 		 * Only need to do work for our port; IBTF will deliver
2159 		 * events for other ports on the hca we have ibt_open_hca'ed
2160 		 * too. Note that id_port is initialized in ibd_attach()
2161 		 * before we do the ibt_open_hca() there.
2162 		 */
2163 		ASSERT(state->id_hca_hdl == hca_hdl);
2164 		if (state->id_port != event->ev_port)
2165 			break;
2166 
2167 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2168 		    IBT_PORT_CHANGE_PKEY) {
2169 			ibd_link_mod(state, code);
2170 		}
2171 		break;
2172 	case IBT_ERROR_PORT_DOWN:
2173 	case IBT_CLNT_REREG_EVENT:
2174 	case IBT_EVENT_PORT_UP:
2175 		/*
2176 		 * Events will be delivered to all instances that have
2177 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2178 		 * Only need to do work for our port; IBTF will deliver
2179 		 * events for other ports on the hca we have ibt_open_hca'ed
2180 		 * too. Note that id_port is initialized in ibd_attach()
2181 		 * before we do the ibt_open_hca() there.
2182 		 */
2183 		ASSERT(state->id_hca_hdl == hca_hdl);
2184 		if (state->id_port != event->ev_port)
2185 			break;
2186 
2187 		ibd_link_mod(state, code);
2188 		break;
2189 
2190 	case IBT_HCA_ATTACH_EVENT:
2191 	case IBT_HCA_DETACH_EVENT:
2192 		/*
2193 		 * When a new card is plugged into the system, attach_event is
2194 		 * invoked. Additionally, a cfgadm needs to be run to make the
2195 		 * card known to the system, and an ifconfig needs to be run to
2196 		 * plumb up any ibd interfaces on the card. In the case of card
2197 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2198 		 * unplumb the ibd interfaces on the card; when the card is
2199 		 * actually unplugged, the detach_event is invoked;
2200 		 * additionally, if any ibd instances are still active on the
2201 		 * card (e.g. there were no associated RCM scripts), the
2202 		 * driver's detach routine is invoked.
2203 		 */
2204 		break;
2205 	default:
2206 		break;
2207 	}
2208 }
2209 
2210 static int
2211 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2212 {
2213 	mac_register_t *macp;
2214 	int ret;
2215 
2216 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2217 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2218 		return (DDI_FAILURE);
2219 	}
2220 
2221 	/*
2222 	 * Note that when we register with mac during attach, we don't
2223 	 * have the id_macaddr yet, so we'll simply be registering a
2224 	 * zero macaddr that we'll overwrite later during plumb (in
2225 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2226 	 * update the mac layer with the correct mtu during plumb.
2227 	 */
2228 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2229 	macp->m_driver = state;
2230 	macp->m_dip = dip;
2231 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2232 	macp->m_callbacks = &ibd_m_callbacks;
2233 	macp->m_min_sdu = 0;
2234 	if (state->id_enable_rc) {
2235 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2236 	} else {
2237 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2238 	}
2239 
2240 	/*
2241 	 *  Register ourselves with the GLDv3 interface
2242 	 */
2243 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2244 		mac_free(macp);
2245 		DPRINT(10,
2246 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2247 		return (DDI_FAILURE);
2248 	}
2249 
2250 	mac_free(macp);
2251 	return (DDI_SUCCESS);
2252 }
2253 
2254 static int
2255 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2256 {
2257 	ibt_hca_attr_t hca_attrs;
2258 	ibt_status_t ibt_status;
2259 
2260 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2261 
2262 	/*
2263 	 * Query the HCA and fetch its attributes
2264 	 */
2265 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2266 	ASSERT(ibt_status == IBT_SUCCESS);
2267 
2268 	/*
2269 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2270 	 *    full checksum offload.
2271 	 */
2272 	if (state->id_enable_rc) {
2273 		state->id_hwcksum_capab = 0;
2274 	} else {
2275 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2276 		    == IBT_HCA_CKSUM_FULL) {
2277 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2278 		}
2279 	}
2280 
2281 	/*
2282 	 * 2. Set LSO policy, capability and maximum length
2283 	 */
2284 	if (state->id_enable_rc) {
2285 		state->id_lso_policy = B_FALSE;
2286 		state->id_lso_capable = B_FALSE;
2287 		state->id_lso_maxlen = 0;
2288 	} else {
2289 		if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS
2290 		    |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2291 			state->id_lso_policy = B_TRUE;
2292 		} else {
2293 			state->id_lso_policy = B_FALSE;
2294 		}
2295 
2296 		if (hca_attrs.hca_max_lso_size > 0) {
2297 			state->id_lso_capable = B_TRUE;
2298 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2299 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2300 			else
2301 				state->id_lso_maxlen =
2302 				    hca_attrs.hca_max_lso_size;
2303 		} else {
2304 			state->id_lso_capable = B_FALSE;
2305 			state->id_lso_maxlen = 0;
2306 		}
2307 	}
2308 
2309 	/*
2310 	 * 3. Set Reserved L_Key capability
2311 	 */
2312 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2313 		state->id_hca_res_lkey_capab = 1;
2314 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2315 		state->rc_enable_iov_map = B_TRUE;
2316 	} else {
2317 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2318 		state->rc_enable_iov_map = B_FALSE;
2319 	}
2320 
2321 	/*
2322 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2323 	 *    size information is provided by the hca
2324 	 */
2325 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2326 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2327 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2328 	} else {
2329 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2330 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2331 	}
2332 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2333 		state->id_max_sqseg = IBD_MAX_SQSEG;
2334 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2335 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2336 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2337 	}
2338 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2339 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2340 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2341 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2342 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2343 	}
2344 
2345 	/*
2346 	 * Translating the virtual address regions into physical regions
2347 	 * for using the Reserved LKey feature results in a wr sgl that
2348 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2349 	 * we'll fix a high-water mark (65%) for when we should stop.
2350 	 */
2351 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2352 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
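
	/*
	 * Illustrative example (editor's addition): with an id_max_sqseg of
	 * 40, the integer arithmetic above gives a high-water mark of
	 * (40 * 65) / 100 = 26 segments, beyond which the reserved-lkey
	 * ibt_map_mem_iov() path is avoided (per the comment above).
	 */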
2353 
2354 	/*
2355 	 * 5. Set number of recv and send wqes after checking hca maximum
2356 	 *    channel size
2357 	 */
2358 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2359 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2360 	} else {
2361 		state->id_num_rwqe = IBD_NUM_RWQE;
2362 	}
2363 	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
2364 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2365 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2366 	} else {
2367 		state->id_num_swqe = IBD_NUM_SWQE;
2368 	}
2369 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2370 
2371 	return (DDI_SUCCESS);
2372 }
2373 
2374 static int
2375 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2376 {
2377 	int instance;
2378 	uint32_t progress = state->id_mac_state;
2379 	ibt_status_t ret;
2380 
2381 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2382 		cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n");
2383 		return (DDI_FAILURE);
2384 	}
2385 
2386 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2387 		cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n");
2388 		return (DDI_FAILURE);
2389 	}
2390 
2391 	/* make sure rx resources are freed */
2392 	ibd_free_rx_rsrcs(state);
2393 
2394 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2395 		ASSERT(state->id_enable_rc);
2396 		ibd_rc_fini_srq_list(state);
2397 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2398 	}
2399 
2400 	if (progress & IBD_DRV_MAC_REGISTERED) {
2401 		(void) mac_unregister(state->id_mh);
2402 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2403 	}
2404 
2405 	if (progress & IBD_DRV_PD_ALLOCD) {
2406 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2407 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2408 			ibd_print_warn(state, "failed to free "
2409 			    "protection domain, ret=%d", ret);
2410 		}
2411 		state->id_pd_hdl = NULL;
2412 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2413 	}
2414 
2415 	if (progress & IBD_DRV_HCA_OPENED) {
2416 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2417 		    IBT_SUCCESS) {
2418 			ibd_print_warn(state, "failed to close "
2419 			    "HCA device, ret=%d", ret);
2420 		}
2421 		state->id_hca_hdl = NULL;
2422 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2423 	}
2424 
2425 	mutex_enter(&ibd_gstate.ig_mutex);
2426 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2427 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2428 		    IBT_SUCCESS) {
2429 			ibd_print_warn(state,
2430 			    "ibt_detach() failed, ret=%d", ret);
2431 		}
2432 		state->id_ibt_hdl = NULL;
2433 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2434 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2435 	}
2436 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2437 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2438 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2439 		    IBT_SUCCESS) {
2440 			ibd_print_warn(state, "ibt_detach(): global "
2441 			    "failed, ret=%d", ret);
2442 		}
2443 		ibd_gstate.ig_ibt_hdl = NULL;
2444 	}
2445 	mutex_exit(&ibd_gstate.ig_mutex);
2446 
2447 	if (progress & IBD_DRV_TXINTR_ADDED) {
2448 		ddi_remove_softintr(state->id_tx);
2449 		state->id_tx = NULL;
2450 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2451 	}
2452 
2453 	if (progress & IBD_DRV_RXINTR_ADDED) {
2454 		ddi_remove_softintr(state->id_rx);
2455 		state->id_rx = NULL;
2456 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2457 	}
2458 
2459 #ifdef DEBUG
2460 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2461 		kstat_delete(state->rc_ksp);
2462 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2463 	}
2464 #endif
2465 
2466 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2467 		ibd_state_fini(state);
2468 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2469 	}
2470 
2471 	instance = ddi_get_instance(dip);
2472 	ddi_soft_state_free(ibd_list, instance);
2473 
2474 	return (DDI_SUCCESS);
2475 }
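
/*
 * Illustrative note (editor's addition): teardown above is driven by the
 * progress bits recorded in id_mac_state during ibd_attach(); each
 * "if (progress & ...)" block undoes one attach step and clears its bit,
 * which is what lets ibd_unattach() be used both from a partially failed
 * attach and from a full detach.
 */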
2476 
2477 /*
2478  * Attach device to the IO framework.
2479  */
2480 static int
2481 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2482 {
2483 	ibd_state_t *state = NULL;
2484 	ib_guid_t hca_guid;
2485 	int instance;
2486 	ibt_status_t ret;
2487 	int rv;
2488 
2489 	/*
2490 	 * IBD doesn't support suspend/resume
2491 	 */
2492 	if (cmd != DDI_ATTACH)
2493 		return (DDI_FAILURE);
2494 
2495 	/*
2496 	 * Allocate softstate structure
2497 	 */
2498 	instance = ddi_get_instance(dip);
2499 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2500 		return (DDI_FAILURE);
2501 	state = ddi_get_soft_state(ibd_list, instance);
2502 
2503 	/*
2504 	 * Initialize mutexes and condition variables
2505 	 */
2506 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2507 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2508 		goto attach_fail;
2509 	}
2510 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2511 
2512 	/*
2513 	 * Allocate rx,tx softintr
2514 	 */
2515 	if (ibd_rx_softintr == 1) {
2516 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2517 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2518 			DPRINT(10, "ibd_attach: failed in "
2519 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2520 			goto attach_fail;
2521 		}
2522 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2523 	}
2524 	if (ibd_tx_softintr == 1) {
2525 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2526 		    NULL, NULL, ibd_tx_recycle,
2527 		    (caddr_t)state)) != DDI_SUCCESS) {
2528 			DPRINT(10, "ibd_attach: failed in "
2529 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2530 			goto attach_fail;
2531 		}
2532 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2533 	}
2534 
2535 	/*
2536 	 * Obtain IBA P_Key, port number and HCA guid and validate
2537 	 * them (for P_Key, only full members are allowed as per
2538 	 * IPoIB specification; neither port number nor HCA guid
2539 	 * can be zero)
2540 	 */
2541 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2542 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2543 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2544 		    state->id_pkey);
2545 		goto attach_fail;
2546 	}
2547 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2548 	    "port-number", 0)) == 0) {
2549 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2550 		    state->id_port);
2551 		goto attach_fail;
2552 	}
2553 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2554 	    "hca-guid", 0)) == 0) {
2555 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2556 		    hca_guid);
2557 		goto attach_fail;
2558 	}
2559 
2560 	/*
2561 	 * Attach to IBTL
2562 	 */
2563 	mutex_enter(&ibd_gstate.ig_mutex);
2564 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2565 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2566 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2567 			DPRINT(10, "ibd_attach: global: failed in "
2568 			    "ibt_attach(), ret=%d", ret);
2569 			mutex_exit(&ibd_gstate.ig_mutex);
2570 			goto attach_fail;
2571 		}
2572 	}
2573 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2574 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2575 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d",
2576 		    ret);
2577 		mutex_exit(&ibd_gstate.ig_mutex);
2578 		goto attach_fail;
2579 	}
2580 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2581 	mutex_exit(&ibd_gstate.ig_mutex);
2582 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2583 
2584 	/*
2585 	 * Open the HCA
2586 	 */
2587 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2588 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2589 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2590 		goto attach_fail;
2591 	}
2592 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2593 
2594 	/* Get RC config before ibd_record_capab */
2595 	ibd_rc_get_conf(state);
2596 
2597 #ifdef DEBUG
2598 	/* Initialize Driver Counters for Reliable Connected Mode */
2599 	if (state->id_enable_rc) {
2600 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2601 			DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats");
2602 			goto attach_fail;
2603 		}
2604 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2605 	}
2606 #endif
2607 
2608 	/*
2609 	 * Record capabilities
2610 	 */
2611 	(void) ibd_record_capab(state, dip);
2612 
2613 	/*
2614 	 * Allocate a protection domain on the HCA
2615 	 */
2616 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2617 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2618 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2619 		goto attach_fail;
2620 	}
2621 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2622 
2623 
2624 	/*
2625 	 * Register ibd interfaces with the Nemo framework
2626 	 */
2627 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2628 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2629 		goto attach_fail;
2630 	}
2631 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2632 
2633 	/*
2634 	 * We're done with everything we could do to make the attach
2635 	 * succeed.  All the buffer allocations and IPoIB broadcast
2636 	 * group joins are deferred to when the interface instance
2637 	 * is actually plumbed to avoid wasting memory.
2638 	 */
2639 	return (DDI_SUCCESS);
2640 
2641 attach_fail:
2642 	(void) ibd_unattach(state, dip);
2643 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2644 	return (DDI_FAILURE);
2645 }
2646 
2647 /*
2648  * Detach device from the IO framework.
2649  */
2650 static int
2651 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2652 {
2653 	ibd_state_t *state;
2654 	int instance;
2655 
2656 	/*
2657 	 * IBD doesn't support suspend/resume
2658 	 */
2659 	if (cmd != DDI_DETACH)
2660 		return (DDI_FAILURE);
2661 
2662 	/*
2663 	 * Get the instance softstate
2664 	 */
2665 	instance = ddi_get_instance(dip);
2666 	state = ddi_get_soft_state(ibd_list, instance);
2667 
2668 	/*
2669 	 * Release all resources we're still holding.  Note that if we'd
2670 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2671 	 * so far, we should find all the flags we need in id_mac_state.
2672 	 */
2673 	return (ibd_unattach(state, dip));
2674 }
2675 
2676 /*
2677  * Pre ibt_attach() driver initialization
2678  */
2679 static int
2680 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2681 {
2682 	char buf[64];
2683 
2684 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2685 	state->id_link_state = LINK_STATE_UNKNOWN;
2686 
2687 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2688 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2689 	state->id_trap_stop = B_TRUE;
2690 	state->id_trap_inprog = 0;
2691 
2692 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2693 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2694 	state->id_dip = dip;
2695 
2696 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2697 
2698 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2699 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2700 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2701 	state->id_tx_busy = 0;
2702 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2703 
2704 	state->id_rx_list.dl_bufs_outstanding = 0;
2705 	state->id_rx_list.dl_cnt = 0;
2706 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2707 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2708 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2709 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2710 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2711 
2712 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2713 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2714 
2715 	/* For Reliable Connected Mode */
2716 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2717 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2718 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2719 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2720 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2721 	    MUTEX_DRIVER, NULL);
2722 
2723 	return (DDI_SUCCESS);
2724 }
2725 
2726 /*
2727  * Post ibt_detach() driver deconstruction
2728  */
2729 static void
2730 ibd_state_fini(ibd_state_t *state)
2731 {
2732 	cv_destroy(&state->id_macst_cv);
2733 	mutex_destroy(&state->id_macst_lock);
2734 
2735 	kmem_cache_destroy(state->id_req_kmc);
2736 
2737 	mutex_destroy(&state->id_rx_list.dl_mutex);
2738 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2739 
2740 	mutex_destroy(&state->id_txpost_lock);
2741 	mutex_destroy(&state->id_tx_list.dl_mutex);
2742 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2743 	mutex_destroy(&state->id_lso_lock);
2744 
2745 	mutex_destroy(&state->id_sched_lock);
2746 	mutex_destroy(&state->id_scq_poll_lock);
2747 	mutex_destroy(&state->id_rcq_poll_lock);
2748 
2749 	cv_destroy(&state->id_trap_cv);
2750 	mutex_destroy(&state->id_trap_lock);
2751 	mutex_destroy(&state->id_link_mutex);
2752 
2753 	/* For Reliable Connected Mode */
2754 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2755 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2756 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2757 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2758 	mutex_destroy(&state->rc_rx_lock);
2759 }
2760 
2761 /*
2762  * Fetch link speed from SA for snmp ifspeed reporting.
2763  */
2764 static uint64_t
2765 ibd_get_portspeed(ibd_state_t *state)
2766 {
2767 	int			ret;
2768 	ibt_path_info_t		path;
2769 	ibt_path_attr_t		path_attr;
2770 	uint8_t			num_paths;
2771 	uint64_t		ifspeed;
2772 
2773 	/*
2774 	 * Due to serdes 8b/10b encoding, 2.5 Gbps on the wire translates
2775 	 * to a 2 Gbps data rate. Thus, the 1X single data rate is
2776 	 * 2000000000. Start with that as the default.
2777 	 */
2778 	ifspeed = 2000000000;
2779 
2780 	bzero(&path_attr, sizeof (path_attr));
2781 
2782 	/*
2783 	 * Get the port speed from Loopback path information.
2784 	 */
2785 	path_attr.pa_dgids = &state->id_sgid;
2786 	path_attr.pa_num_dgids = 1;
2787 	path_attr.pa_sgid = state->id_sgid;
2788 
2789 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2790 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2791 		goto earlydone;
2792 
2793 	if (num_paths < 1)
2794 		goto earlydone;
2795 
2796 	/*
2797 	 * In case SA does not return an expected value, report the default
2798 	 * speed as 1X.
2799 	 */
2800 	ret = 1;
2801 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2802 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2803 			ret = 1;
2804 			break;
2805 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2806 			ret = 4;
2807 			break;
2808 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2809 			ret = 12;
2810 			break;
2811 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2812 			ret = 2;
2813 			break;
2814 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2815 			ret = 8;
2816 			break;
2817 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2818 			ret = 16;
2819 			break;
2820 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2821 			ret = 24;
2822 			break;
2823 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2824 			ret = 32;
2825 			break;
2826 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2827 			ret = 48;
2828 			break;
2829 	}
2830 
2831 	ifspeed *= ret;
2832 
2833 earlydone:
2834 	return (ifspeed);
2835 }
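
/*
 * Illustrative example (editor's addition): a 4X DDR link is reported by
 * the SA as IBT_SRATE_20 (20 Gbps signalling), so the switch above selects
 * a multiplier of 8 and ibd_get_portspeed() returns 8 * 2000000000, i.e. a
 * 16 Gbps data rate after the 8b/10b encoding overhead.
 */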
2836 
2837 /*
2838  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2839  * representing the input mcg mgid.
2840  */
2841 static ibd_mce_t *
2842 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2843 {
2844 	ibd_mce_t *ptr = list_head(mlist);
2845 
2846 	/*
2847 	 * Do plain linear search.
2848 	 */
2849 	while (ptr != NULL) {
2850 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2851 		    sizeof (ib_gid_t)) == 0)
2852 			return (ptr);
2853 		ptr = list_next(mlist, ptr);
2854 	}
2855 	return (NULL);
2856 }
2857 
2858 /*
2859  * Execute IBA JOIN.
2860  */
2861 static ibt_status_t
2862 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2863 {
2864 	ibt_mcg_attr_t mcg_attr;
2865 
2866 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2867 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2868 	mcg_attr.mc_mgid = mgid;
2869 	mcg_attr.mc_join_state = mce->mc_jstate;
2870 	mcg_attr.mc_scope = state->id_scope;
2871 	mcg_attr.mc_pkey = state->id_pkey;
2872 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2873 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2874 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2875 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2876 	    NULL, NULL));
2877 }
2878 
2879 /*
2880  * This code JOINs the port in the proper way (depending on the join
2881  * state) so that the IBA fabric will forward mcg packets to/from the port.
2882  * It also attaches the QPN to the mcg so it can receive those mcg
2883  * packets. This code makes sure not to attach the mcg to the QP if
2884  * that has been previously done due to the mcg being joined with a
2885  * different join state, even though this is not required by SWG_0216,
2886  * refid 3610.
2887  */
2888 static ibd_mce_t *
2889 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2890 {
2891 	ibt_status_t ibt_status;
2892 	ibd_mce_t *mce, *tmce, *omce = NULL;
2893 	boolean_t do_attach = B_TRUE;
2894 
2895 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2896 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2897 
2898 	/*
2899 	 * For enable_multicast Full member joins, we need to do some
2900 	 * extra work. If there is already an mce on the list that
2901 	 * indicates full membership, that means the membership has
2902 	 * not yet been dropped (since the disable_multicast was issued)
2903 	 * because there are pending Tx's to the mcg; in that case, just
2904 	 * mark the mce not to be reaped when the Tx completion queues
2905 	 * an async reap operation.
2906 	 *
2907 	 * If there is already an mce on the list indicating sendonly
2908 	 * membership, try to promote to full membership. Be careful
2909 	 * not to deallocate the old mce, since there might be an AH
2910 	 * pointing to it; instead, update the old mce with new data
2911 	 * that tracks the full membership.
2912 	 */
2913 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2914 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2915 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2916 			ASSERT(omce->mc_fullreap);
2917 			omce->mc_fullreap = B_FALSE;
2918 			return (omce);
2919 		} else {
2920 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2921 		}
2922 	}
2923 
2924 	/*
2925 	 * Allocate the ibd_mce_t to track this JOIN.
2926 	 */
2927 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2928 	mce->mc_fullreap = B_FALSE;
2929 	mce->mc_jstate = jstate;
2930 
2931 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2932 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2933 		    ibt_status);
2934 		kmem_free(mce, sizeof (ibd_mce_t));
2935 		return (NULL);
2936 	}
2937 
2938 	/*
2939 	 * Is an IBA attach required? Not if the interface is already joined
2940 	 * to the mcg in a different appropriate join state.
2941 	 */
2942 	if (jstate == IB_MC_JSTATE_NON) {
2943 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2944 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2945 			do_attach = B_FALSE;
2946 	} else if (jstate == IB_MC_JSTATE_FULL) {
2947 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2948 			do_attach = B_FALSE;
2949 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2950 		do_attach = B_FALSE;
2951 	}
2952 
2953 	if (do_attach) {
2954 		/*
2955 		 * Do the IBA attach.
2956 		 */
2957 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2958 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2959 		    &mce->mc_info)) != IBT_SUCCESS) {
2960 			DPRINT(10, "ibd_join_group : failed qp attachment "
2961 			    "%d\n", ibt_status);
2962 			/*
2963 			 * NOTE that we should probably preserve the join info
2964 			 * in the list and later try to leave again at detach
2965 			 * time.
2966 			 */
2967 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2968 			    state->id_sgid, jstate);
2969 			kmem_free(mce, sizeof (ibd_mce_t));
2970 			return (NULL);
2971 		}
2972 	}
2973 
2974 	/*
2975 	 * Insert the ibd_mce_t in the proper list.
2976 	 */
2977 	if (jstate == IB_MC_JSTATE_NON) {
2978 		IBD_MCACHE_INSERT_NON(state, mce);
2979 	} else {
2980 		/*
2981 		 * Set up the mc_req fields used for reaping the
2982 		 * mcg in case of delayed tx completion (see
2983 		 * ibd_tx_cleanup()). Also done for sendonly join in
2984 		 * case we are promoted to fullmembership later and
2985 		 * keep using the same mce.
2986 		 */
2987 		mce->mc_req.rq_gid = mgid;
2988 		mce->mc_req.rq_ptr = mce;
2989 		/*
2990 		 * Check whether this is the case of trying to join
2991 		 * full member, and we were already joined send only.
2992 		 * We try to drop our SendOnly membership, but it is
2993 		 * possible that the mcg does not exist anymore (and
2994 		 * the subnet trap never reached us), so the leave
2995 		 * operation might fail.
2996 		 */
2997 		if (omce != NULL) {
2998 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2999 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3000 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3001 			bcopy(&mce->mc_info, &omce->mc_info,
3002 			    sizeof (ibt_mcg_info_t));
3003 			kmem_free(mce, sizeof (ibd_mce_t));
3004 			return (omce);
3005 		}
3006 		mutex_enter(&state->id_mc_mutex);
3007 		IBD_MCACHE_INSERT_FULL(state, mce);
3008 		mutex_exit(&state->id_mc_mutex);
3009 	}
3010 
3011 	return (mce);
3012 }
3013 
3014 /*
3015  * Called during port up event handling to attempt to reacquire full
3016  * membership in an mcg. Stripped down version of ibd_join_group().
3017  * Note that it is possible that the mcg might have gone away and
3018  * been recreated by this point.
3019  */
3020 static void
3021 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3022 {
3023 	ib_gid_t mgid;
3024 
3025 	/*
3026 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3027 	 * reap/leave is going to try to leave the group. We could prevent
3028 	 * that by adding a boolean flag into ibd_mce_t, if required.
3029 	 */
3030 	if (mce->mc_fullreap)
3031 		return;
3032 
3033 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3034 
3035 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3036 	    mgid.gid_guid);
3037 
3038 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3039 		ibd_print_warn(state, "Failure on port up to rejoin "
3040 		    "multicast gid %016llx:%016llx",
3041 		    (u_longlong_t)mgid.gid_prefix,
3042 		    (u_longlong_t)mgid.gid_guid);
3043 }
3044 
3045 /*
3046  * This code handles delayed Tx completion cleanups for mcg's to which
3047  * disable_multicast has been issued, regular mcg related cleanups during
3048  * disable_multicast, disable_promiscuous and mcg traps, as well as
3049  * cleanups during driver detach time. Depending on the join state,
3050  * it deletes the mce from the appropriate list and issues the IBA
3051  * leave/detach; except in the disable_multicast case when the mce
3052  * is left on the active list for a subsequent Tx completion cleanup.
3053  */
3054 static void
3055 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3056     uint8_t jstate)
3057 {
3058 	ibd_mce_t *tmce;
3059 	boolean_t do_detach = B_TRUE;
3060 
3061 	/*
3062 	 * Before detaching, we must check whether the other list
3063 	 * contains the mcg; if we detach blindly, the consumer
3064 	 * who set up the other list will also stop receiving
3065 	 * traffic.
3066 	 */
3067 	if (jstate == IB_MC_JSTATE_FULL) {
3068 		/*
3069 		 * The following check is only relevant while coming
3070 		 * from the Tx completion path in the reap case.
3071 		 */
3072 		if (!mce->mc_fullreap)
3073 			return;
3074 		mutex_enter(&state->id_mc_mutex);
3075 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3076 		mutex_exit(&state->id_mc_mutex);
3077 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3078 			do_detach = B_FALSE;
3079 	} else if (jstate == IB_MC_JSTATE_NON) {
3080 		IBD_MCACHE_PULLOUT_NON(state, mce);
3081 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3082 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3083 			do_detach = B_FALSE;
3084 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3085 		mutex_enter(&state->id_mc_mutex);
3086 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3087 		mutex_exit(&state->id_mc_mutex);
3088 		do_detach = B_FALSE;
3089 	}
3090 
3091 	/*
3092 	 * If we are reacting to a mcg trap and leaving our sendonly or
3093 	 * non membership, the mcg is possibly already gone, so attempting
3094 	 * to leave might fail. On the other hand, we must try to leave
3095 	 * anyway, since this might be a trap from long ago, and we could
3096 	 * have potentially sendonly joined to a recent incarnation of
3097 	 * the mcg and are about to lose track of this information.
3098 	 */
3099 	if (do_detach) {
3100 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3101 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3102 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3103 	}
3104 
3105 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3106 	kmem_free(mce, sizeof (ibd_mce_t));
3107 }
3108 
3109 /*
3110  * Async code executed due to multicast and promiscuous disable requests
3111  * and mcg trap handling; also executed during driver detach. Mostly, a
3112  * leave and detach is done; except for the fullmember case when Tx
3113  * requests are pending, whence arrangements are made for subsequent
3114  * cleanup on Tx completion.
3115  */
3116 static void
3117 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3118 {
3119 	ipoib_mac_t mcmac;
3120 	boolean_t recycled;
3121 	ibd_mce_t *mce;
3122 
3123 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3124 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3125 
3126 	if (jstate == IB_MC_JSTATE_NON) {
3127 		recycled = B_TRUE;
3128 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3129 		/*
3130 		 * In case we are handling a mcg trap, we might not find
3131 		 * the mcg in the non list.
3132 		 */
3133 		if (mce == NULL) {
3134 			return;
3135 		}
3136 	} else {
3137 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3138 
3139 		/*
3140 		 * In case we are handling a mcg trap, make sure the trap
3141 		 * is not arriving late; if we have an mce that indicates
3142 		 * that we are already a fullmember, that would be a clear
3143 		 * indication that the trap arrived late (ie, is for a
3144 		 * previous incarnation of the mcg).
3145 		 */
3146 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3147 			if ((mce == NULL) || (mce->mc_jstate ==
3148 			    IB_MC_JSTATE_FULL)) {
3149 				return;
3150 			}
3151 		} else {
3152 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3153 
3154 			/*
3155 			 * If the join group failed, mce will be NULL here.
3156 			 * This is because in a GLDv3 driver, the set-multicast
3157 			 * entry point always returns success.
3158 			 */
3159 			if (mce == NULL) {
3160 				return;
3161 			}
3162 
3163 			mce->mc_fullreap = B_TRUE;
3164 		}
3165 
3166 		/*
3167 		 * If no pending Tx's remain that reference the AH
3168 		 * for the mcg, recycle it from active to free list.
3169 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3170 		 * so the last completing Tx will cause an async reap
3171 		 * operation to be invoked, at which time we will drop our
3172 		 * membership to the mcg so that the pending Tx's complete
3173 		 * successfully. Refer to comments on "AH and MCE active
3174 		 * list manipulation" at top of this file. The lock protects
3175 		 * against Tx fast path and Tx cleanup code.
3176 		 */
3177 		mutex_enter(&state->id_ac_mutex);
3178 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3179 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3180 		    IB_MC_JSTATE_SEND_ONLY_NON));
3181 		mutex_exit(&state->id_ac_mutex);
3182 	}
3183 
3184 	if (recycled) {
3185 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3186 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3187 		ibd_async_reap_group(state, mce, mgid, jstate);
3188 	}
3189 }
3190 
3191 /*
3192  * Find the broadcast address as defined by IPoIB; implicitly
3193  * determines the IBA scope, mtu, tclass etc of the link the
3194  * interface is going to be a member of.
3195  */
3196 static ibt_status_t
3197 ibd_find_bgroup(ibd_state_t *state)
3198 {
3199 	ibt_mcg_attr_t mcg_attr;
3200 	uint_t numg;
3201 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3202 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3203 	    IB_MC_SCOPE_GLOBAL };
3204 	int i, mcgmtu;
3205 	boolean_t found = B_FALSE;
3206 	int ret;
3207 	ibt_mcg_info_t mcg_info;
3208 
3209 	state->id_bgroup_created = B_FALSE;
3210 
3211 query_bcast_grp:
3212 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3213 	mcg_attr.mc_pkey = state->id_pkey;
3214 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3215 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3216 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3217 
3218 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3219 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3220 
3221 		/*
3222 		 * Look for the IPoIB broadcast group.
3223 		 */
3224 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3225 		state->id_mgid.gid_prefix =
3226 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3227 		    ((uint64_t)state->id_scope << 48) |
3228 		    ((uint32_t)(state->id_pkey << 16)));
3229 		mcg_attr.mc_mgid = state->id_mgid;
3230 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3231 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3232 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3233 			found = B_TRUE;
3234 			break;
3235 		}
3236 	}
3237 
3238 	if (!found) {
3239 		if (ibd_create_broadcast_group) {
3240 			/*
3241 			 * If we created the broadcast group, but failed to
3242 			 * find it, we can't do anything except leave the
3243 			 * one we created and return failure.
3244 			 */
3245 			if (state->id_bgroup_created) {
3246 				ibd_print_warn(state, "IPoIB broadcast group "
3247 				    "absent. Unable to query after create.");
3248 				goto find_bgroup_fail;
3249 			}
3250 
3251 			/*
3252 			 * Create the IPoIB broadcast group if it didn't exist.
3253 			 */
3254 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3255 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3256 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3257 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3258 			mcg_attr.mc_pkey = state->id_pkey;
3259 			mcg_attr.mc_flow = 0;
3260 			mcg_attr.mc_sl = 0;
3261 			mcg_attr.mc_tclass = 0;
3262 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3263 			state->id_mgid.gid_prefix =
3264 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3265 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3266 			    ((uint32_t)(state->id_pkey << 16)));
3267 			mcg_attr.mc_mgid = state->id_mgid;
3268 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3269 
3270 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3271 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3272 				ibd_print_warn(state, "IPoIB broadcast group "
3273 				    "absent, create failed: ret = %d\n", ret);
3274 				state->id_bgroup_created = B_FALSE;
3275 				return (IBT_FAILURE);
3276 			}
3277 			state->id_bgroup_created = B_TRUE;
3278 			goto query_bcast_grp;
3279 		} else {
3280 			ibd_print_warn(state, "IPoIB broadcast group absent");
3281 			return (IBT_FAILURE);
3282 		}
3283 	}
3284 
3285 	/*
3286 	 * Verify the mcg mtu <= id_mtu, then adopt the mcg mtu as id_mtu.
3287 	 */
3288 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3289 	if (state->id_mtu < mcgmtu) {
3290 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3291 		    "greater than port's maximum MTU %d", mcgmtu,
3292 		    state->id_mtu);
3293 		ibt_free_mcg_info(state->id_mcinfo, 1);
3294 		goto find_bgroup_fail;
3295 	}
3296 	state->id_mtu = mcgmtu;
3297 
3298 	return (IBT_SUCCESS);
3299 
3300 find_bgroup_fail:
3301 	if (state->id_bgroup_created) {
3302 		(void) ibt_leave_mcg(state->id_sgid,
3303 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3304 		    IB_MC_JSTATE_FULL);
3305 	}
3306 
3307 	return (IBT_FAILURE);
3308 }
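
/*
 * Illustrative sketch (editor's addition, example only, not compiled into
 * the driver): mirrors the gid_prefix construction in ibd_find_bgroup()
 * above. With scope IB_MC_SCOPE_SUBNET_LOCAL (0x2) and the default
 * full-member pkey 0xffff, this works out to the well-known IPoIB IPv4
 * broadcast GID ff12:401b:ffff::ffff:ffff once IB_MGID_IPV4_LOWGRP_MASK
 * fills the guid half.
 */
#if 0	/* example only */
static uint64_t
ibd_example_bcast_prefix(uint8_t scope, uint16_t pkey)
{
	return (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
	    ((uint64_t)scope << 48) | ((uint32_t)(pkey << 16)));
}
#endif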
3309 
3310 static int
3311 ibd_alloc_tx_copybufs(ibd_state_t *state)
3312 {
3313 	ibt_mr_attr_t mem_attr;
3314 
3315 	/*
3316 	 * Allocate one big chunk for all regular tx copy bufs
3317 	 */
3318 	state->id_tx_buf_sz = state->id_mtu;
3319 	if (state->id_lso_policy && state->id_lso_capable &&
3320 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3321 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3322 	}
3323 
3324 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3325 	    state->id_tx_buf_sz, KM_SLEEP);
3326 
3327 	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
3328 	    sizeof (ibd_swqe_t), KM_SLEEP);
3329 
3330 	/*
3331 	 * Do one memory registration on the entire txbuf area
3332 	 */
3333 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3334 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3335 	mem_attr.mr_as = NULL;
3336 	mem_attr.mr_flags = IBT_MR_SLEEP;
3337 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3338 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3339 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3340 		kmem_free(state->id_tx_wqes,
3341 		    state->id_num_swqe * sizeof (ibd_swqe_t));
3342 		kmem_free(state->id_tx_bufs,
3343 		    state->id_num_swqe * state->id_tx_buf_sz);
3344 		state->id_tx_bufs = NULL;
3345 		return (DDI_FAILURE);
3346 	}
3347 
3348 	return (DDI_SUCCESS);
3349 }
3350 
3351 static int
3352 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3353 {
3354 	ibt_mr_attr_t mem_attr;
3355 	ibd_lsobuf_t *buflist;
3356 	ibd_lsobuf_t *lbufp;
3357 	ibd_lsobuf_t *tail;
3358 	ibd_lsobkt_t *bktp;
3359 	uint8_t *membase;
3360 	uint8_t *memp;
3361 	uint_t memsz;
3362 	int i;
3363 
3364 	/*
3365 	 * Allocate the lso bucket
3366 	 */
3367 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3368 
3369 	/*
3370 	 * Allocate the entire lso memory and register it
3371 	 */
3372 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3373 	membase = kmem_zalloc(memsz, KM_SLEEP);
3374 
3375 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3376 	mem_attr.mr_len = memsz;
3377 	mem_attr.mr_as = NULL;
3378 	mem_attr.mr_flags = IBT_MR_SLEEP;
3379 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3380 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3381 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3382 		kmem_free(membase, memsz);
3383 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3384 		return (DDI_FAILURE);
3385 	}
3386 
3387 	mutex_enter(&state->id_lso_lock);
3388 
3389 	/*
3390 	 * Now allocate the buflist.  Note that the elements in the buflist and
3391 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3392 	 * can always derive the address of a buflist entry from the address of
3393 	 * an lso buffer.
3394 	 */
3395 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3396 	    KM_SLEEP);
3397 
3398 	/*
3399 	 * Set up the lso buf chain
3400 	 */
3401 	memp = membase;
3402 	lbufp = buflist;
3403 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3404 		lbufp->lb_isfree = 1;
3405 		lbufp->lb_buf = memp;
3406 		lbufp->lb_next = lbufp + 1;
3407 
3408 		tail = lbufp;
3409 
3410 		memp += IBD_LSO_BUFSZ;
3411 		lbufp++;
3412 	}
3413 	tail->lb_next = NULL;
3414 
3415 	/*
3416 	 * Set up the LSO buffer information in ibd state
3417 	 */
3418 	bktp->bkt_bufl = buflist;
3419 	bktp->bkt_free_head = buflist;
3420 	bktp->bkt_mem = membase;
3421 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3422 	bktp->bkt_nfree = bktp->bkt_nelem;
3423 
3424 	state->id_lso = bktp;
3425 	mutex_exit(&state->id_lso_lock);
3426 
3427 	return (DDI_SUCCESS);
3428 }
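
/*
 * Illustrative sketch, not part of the driver (hypothetical guard macro
 * and helper name): since the buflist entries and the LSO buffers above
 * have a permanent 1-1 relation and identical ordering, a buffer address
 * can be mapped back to its buflist element by pointer arithmetic alone,
 * which is what ibd_release_lsobufs() relies on.
 */
#ifdef IBD_EXAMPLE_SKETCH
static ibd_lsobuf_t *
example_lsobuf_for_va(ibd_lsobkt_t *bktp, uint8_t *va)
{
	uint_t ndx = (uint_t)((va - bktp->bkt_mem) / IBD_LSO_BUFSZ);

	return (bktp->bkt_bufl + ndx);
}
#endif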
3429 
3430 /*
3431  * Statically allocate Tx buffer list(s).
3432  */
3433 static int
3434 ibd_init_txlist(ibd_state_t *state)
3435 {
3436 	ibd_swqe_t *swqe;
3437 	ibt_lkey_t lkey;
3438 	int i;
3439 	uint_t len;
3440 	uint8_t *bufaddr;
3441 
3442 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3443 		return (DDI_FAILURE);
3444 
3445 	if (state->id_lso_policy && state->id_lso_capable) {
3446 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3447 			state->id_lso_policy = B_FALSE;
3448 	}
3449 
3450 	mutex_enter(&state->id_tx_list.dl_mutex);
3451 	state->id_tx_list.dl_head = NULL;
3452 	state->id_tx_list.dl_pending_sends = B_FALSE;
3453 	state->id_tx_list.dl_cnt = 0;
3454 	mutex_exit(&state->id_tx_list.dl_mutex);
3455 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3456 	state->id_tx_rel_list.dl_head = NULL;
3457 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3458 	state->id_tx_rel_list.dl_cnt = 0;
3459 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3460 
3461 	/*
3462 	 * Allocate and setup the swqe list
3463 	 */
3464 	lkey = state->id_tx_mr_desc.md_lkey;
3465 	bufaddr = state->id_tx_bufs;
3466 	len = state->id_tx_buf_sz;
3467 	swqe = state->id_tx_wqes;
3468 	mutex_enter(&state->id_tx_list.dl_mutex);
3469 	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
3470 		swqe->swqe_next = NULL;
3471 		swqe->swqe_im_mblk = NULL;
3472 
3473 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3474 		    bufaddr;
3475 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3476 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3477 
3478 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3479 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3480 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3481 
3482 		/* These are set in send */
3483 		swqe->w_swr.wr_nds = 0;
3484 		swqe->w_swr.wr_sgl = NULL;
3485 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3486 
3487 		/* add to list */
3488 		state->id_tx_list.dl_cnt++;
3489 		swqe->swqe_next = state->id_tx_list.dl_head;
3490 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3491 	}
3492 	mutex_exit(&state->id_tx_list.dl_mutex);
3493 
3494 	return (DDI_SUCCESS);
3495 }
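
/*
 * Illustrative sketch, not part of the driver (hypothetical guard macro
 * and helper name): the tx copy buffers set up above are slices of the
 * single registered region, so swqe i's buffer simply starts at
 * id_tx_bufs + i * id_tx_buf_sz, and every swqe's sgl shares the one
 * lkey obtained from the region-wide ibt_register_mr() call.
 */
#ifdef IBD_EXAMPLE_SKETCH
static uint8_t *
example_swqe_copybuf_base(ibd_state_t *state, uint_t i)
{
	return (state->id_tx_bufs + (size_t)i * state->id_tx_buf_sz);
}
#endif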
3496 
3497 static int
3498 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3499     uint32_t *nds_p)
3500 {
3501 	ibd_lsobkt_t *bktp;
3502 	ibd_lsobuf_t *lbufp;
3503 	ibd_lsobuf_t *nextp;
3504 	ibt_lkey_t lso_lkey;
3505 	uint_t frag_sz;
3506 	uint_t num_needed;
3507 	int i;
3508 
3509 	ASSERT(sgl_p != NULL);
3510 	ASSERT(nds_p != NULL);
3511 	ASSERT(req_sz != 0);
3512 
3513 	/*
3514 	 * Determine how many bufs we'd need for the size requested
3515 	 */
3516 	num_needed = req_sz / IBD_LSO_BUFSZ;
3517 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3518 		num_needed++;
3519 
3520 	mutex_enter(&state->id_lso_lock);
3521 
3522 	/*
3523 	 * If we don't have enough lso bufs, return failure
3524 	 */
3525 	ASSERT(state->id_lso != NULL);
3526 	bktp = state->id_lso;
3527 	if (bktp->bkt_nfree < num_needed) {
3528 		mutex_exit(&state->id_lso_lock);
3529 		return (-1);
3530 	}
3531 
3532 	/*
3533 	 * Pick the first 'num_needed' bufs from the free list
3534 	 */
3535 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3536 	lbufp = bktp->bkt_free_head;
3537 	for (i = 0; i < num_needed; i++) {
3538 		ASSERT(lbufp->lb_isfree != 0);
3539 		ASSERT(lbufp->lb_buf != NULL);
3540 
3541 		nextp = lbufp->lb_next;
3542 
3543 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3544 		sgl_p[i].ds_key = lso_lkey;
3545 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3546 
3547 		lbufp->lb_isfree = 0;
3548 		lbufp->lb_next = NULL;
3549 
3550 		lbufp = nextp;
3551 	}
3552 	bktp->bkt_free_head = lbufp;
3553 
3554 	/*
3555 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3556 	 * to adjust the last sgl entry's length. Since we know we need at least
3557 	 * one, the i-1 use below is ok.
3558 	 */
3559 	if (frag_sz) {
3560 		sgl_p[i-1].ds_len = frag_sz;
3561 	}
3562 
3563 	/*
3564 	 * Update nfree count and return
3565 	 */
3566 	bktp->bkt_nfree -= num_needed;
3567 
3568 	mutex_exit(&state->id_lso_lock);
3569 
3570 	*nds_p = num_needed;
3571 
3572 	return (0);
3573 }
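
/*
 * Worked example, illustrative only (hypothetical guard macro and helper
 * name): how a request is split across LSO buffers above. Assuming, say,
 * an 8192-byte IBD_LSO_BUFSZ, a 20000-byte request needs three buffers,
 * and the last sgl entry is trimmed to 20000 - 2 * 8192 = 3616 bytes.
 */
#ifdef IBD_EXAMPLE_SKETCH
static uint_t
example_lso_bufs_needed(uint_t req_sz, uint_t bufsz, uint_t *last_len)
{
	uint_t num_needed = req_sz / bufsz;
	uint_t frag_sz = req_sz % bufsz;

	if (frag_sz != 0)
		num_needed++;
	*last_len = (frag_sz != 0) ? frag_sz : bufsz;

	return (num_needed);
}
#endif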
3574 
3575 static void
3576 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3577 {
3578 	ibd_lsobkt_t *bktp;
3579 	ibd_lsobuf_t *lbufp;
3580 	uint8_t *lso_mem_end;
3581 	uint_t ndx;
3582 	int i;
3583 
3584 	mutex_enter(&state->id_lso_lock);
3585 
3586 	bktp = state->id_lso;
3587 	ASSERT(bktp != NULL);
3588 
3589 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3590 	for (i = 0; i < nds; i++) {
3591 		uint8_t *va;
3592 
3593 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3594 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3595 
3596 		/*
3597 		 * Figure out the buflist element this sgl buffer corresponds
3598 		 * to and put it back at the head
3599 		 */
3600 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3601 		lbufp = bktp->bkt_bufl + ndx;
3602 
3603 		ASSERT(lbufp->lb_isfree == 0);
3604 		ASSERT(lbufp->lb_buf == va);
3605 
3606 		lbufp->lb_isfree = 1;
3607 		lbufp->lb_next = bktp->bkt_free_head;
3608 		bktp->bkt_free_head = lbufp;
3609 	}
3610 	bktp->bkt_nfree += nds;
3611 
3612 	mutex_exit(&state->id_lso_lock);
3613 }
3614 
3615 static void
3616 ibd_free_tx_copybufs(ibd_state_t *state)
3617 {
3618 	/*
3619 	 * Unregister txbuf mr
3620 	 */
3621 	if (ibt_deregister_mr(state->id_hca_hdl,
3622 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3623 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3624 	}
3625 	state->id_tx_mr_hdl = NULL;
3626 
3627 	/*
3628 	 * Free txbuf memory
3629 	 */
3630 	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
3631 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3632 	state->id_tx_wqes = NULL;
3633 	state->id_tx_bufs = NULL;
3634 }
3635 
3636 static void
3637 ibd_free_tx_lsobufs(ibd_state_t *state)
3638 {
3639 	ibd_lsobkt_t *bktp;
3640 
3641 	mutex_enter(&state->id_lso_lock);
3642 
3643 	if ((bktp = state->id_lso) == NULL) {
3644 		mutex_exit(&state->id_lso_lock);
3645 		return;
3646 	}
3647 
3648 	/*
3649 	 * First, free the buflist
3650 	 */
3651 	ASSERT(bktp->bkt_bufl != NULL);
3652 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3653 
3654 	/*
3655 	 * Unregister the LSO memory and free it
3656 	 */
3657 	ASSERT(bktp->bkt_mr_hdl != NULL);
3658 	if (ibt_deregister_mr(state->id_hca_hdl,
3659 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3660 		DPRINT(10,
3661 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3662 	}
3663 	ASSERT(bktp->bkt_mem);
3664 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3665 
3666 	/*
3667 	 * Finally free the bucket
3668 	 */
3669 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3670 	state->id_lso = NULL;
3671 
3672 	mutex_exit(&state->id_lso_lock);
3673 }
3674 
3675 /*
3676  * Free the statically allocated Tx buffer list.
3677  */
3678 static void
3679 ibd_fini_txlist(ibd_state_t *state)
3680 {
3681 	/*
3682 	 * Free the allocated swqes
3683 	 */
3684 	mutex_enter(&state->id_tx_list.dl_mutex);
3685 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3686 	state->id_tx_list.dl_head = NULL;
3687 	state->id_tx_list.dl_pending_sends = B_FALSE;
3688 	state->id_tx_list.dl_cnt = 0;
3689 	state->id_tx_rel_list.dl_head = NULL;
3690 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3691 	state->id_tx_rel_list.dl_cnt = 0;
3692 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3693 	mutex_exit(&state->id_tx_list.dl_mutex);
3694 
3695 	ibd_free_tx_lsobufs(state);
3696 	ibd_free_tx_copybufs(state);
3697 }
3698 
3699 /*
3700  * Post a NULL-terminated list of rwqes.
3701  */
3702 static void
3703 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3704 {
3705 	uint_t		i;
3706 	uint_t		num_posted;
3707 	ibt_status_t	ibt_status;
3708 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3709 
3710 	while (rwqe) {
3711 		/* Post up to IBD_RX_POST_CNT receive work requests */
3712 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3713 			wrs[i] = rwqe->w_rwr;
3714 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3715 			if (rwqe == NULL) {
3716 				i++;
3717 				break;
3718 			}
3719 		}
3720 
3721 		/*
3722 		 * If posting fails for some reason, we'll never receive a
3723 		 * completion notification, so we'll need to clean up. But
3724 		 * we need to make sure we don't clean up nodes whose
3725 		 * wrs have been successfully posted. We assume that the
3726 		 * hca driver returns on the first failure to post and
3727 		 * therefore the first 'num_posted' entries don't need
3728 		 * cleanup here.
3729 		 */
3730 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3731 
3732 		num_posted = 0;
3733 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3734 		    &num_posted);
3735 		if (ibt_status != IBT_SUCCESS) {
3736 			/* This cannot happen unless the device has an error. */
3737 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3738 			    "posting multiple wrs failed: "
3739 			    "requested=%d, done=%d, ret=%d",
3740 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3741 			atomic_add_32(&state->id_rx_list.dl_cnt,
3742 			    num_posted - i);
3743 		}
3744 	}
3745 }
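
/*
 * Sketch of the count accounting above, illustrative only (hypothetical
 * guard macro and helper name): dl_cnt is bumped optimistically by the
 * size of the chunk being posted, then corrected by (num_posted - chunk)
 * on a partial post, leaving a net increase of exactly num_posted.
 */
#ifdef IBD_EXAMPLE_SKETCH
static void
example_rx_cnt_accounting(volatile uint32_t *dl_cnt, uint_t chunk,
    uint_t num_posted)
{
	atomic_add_32(dl_cnt, chunk);	/* assume the whole chunk posts */
	if (num_posted != chunk)
		atomic_add_32(dl_cnt, num_posted - chunk);
}
#endif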
3746 
3747 /*
3748  * Grab a list of rwqes from the array of lists, and post the list.
3749  */
3750 static void
3751 ibd_post_recv_intr(ibd_state_t *state)
3752 {
3753 	ibd_rx_queue_t	*rxp;
3754 	ibd_rwqe_t *list;
3755 
3756 	/* rotate through the rx_queue array, expecting an adequate number */
3757 	state->id_rx_post_queue_index =
3758 	    (state->id_rx_post_queue_index + 1) &
3759 	    (state->id_rx_nqueues - 1);
3760 
3761 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3762 	mutex_enter(&rxp->rx_post_lock);
3763 	list = WQE_TO_RWQE(rxp->rx_head);
3764 	rxp->rx_head = NULL;
3765 	rxp->rx_cnt = 0;
3766 	mutex_exit(&rxp->rx_post_lock);
3767 	ibd_post_recv_list(state, list);
3768 }
3769 
3770 /* macro explained below */
3771 #define	RX_QUEUE_HASH(rwqe) \
3772 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3773 
3774 /*
3775  * Add a rwqe to one of the Rx lists.  If the list is large enough
3776  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3777  *
3778  * Note: one of 2^N lists is chosen via a hash, because a single
3779  * list would be a point of lock contention.  If the first list is busy
3780  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3781  *
3782  * The shift by 8 in RX_QUEUE_HASH is an arbitrary choice that provides
3783  * an even distribution of rwqes across the 2^N queues.
3784  */
3785 static void
3786 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3787 {
3788 	ibd_rx_queue_t	*rxp;
3789 
3790 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3791 
3792 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3793 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3794 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3795 		mutex_enter(&rxp->rx_post_lock);
3796 	}
3797 	rwqe->rwqe_next = rxp->rx_head;
3798 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3799 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
3800 
3801 		/* only call ibt_post_recv() every Nth time through here */
3802 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
3803 			rxp->rx_head = NULL;
3804 			rxp->rx_cnt = 0;
3805 			mutex_exit(&rxp->rx_post_lock);
3806 			ibd_post_recv_list(state, rwqe);
3807 			return;
3808 		}
3809 	}
3810 	rxp->rx_head = RWQE_TO_WQE(rwqe);
3811 	mutex_exit(&rxp->rx_post_lock);
3812 }
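
/*
 * Illustrative sketch, not part of the driver (hypothetical guard macro
 * and helper name): the queue selection done by RX_QUEUE_HASH above.
 * Assuming, say, eight queues, an rwqe at address 0x30007a00 hashes to
 * ((0x30007a00 >> 8) & 7) == 2.
 */
#ifdef IBD_EXAMPLE_SKETCH
static uint_t
example_rx_queue_hash(uintptr_t rwqe_addr, uint_t nqueues)
{
	/* nqueues must be a power of two, as id_rx_nqueues is */
	return ((rwqe_addr >> 8) & (nqueues - 1));
}
#endif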
3813 
3814 static int
3815 ibd_alloc_rx_copybufs(ibd_state_t *state)
3816 {
3817 	ibt_mr_attr_t mem_attr;
3818 	int i;
3819 
3820 	/*
3821 	 * Allocate one big chunk for all regular rx copy bufs
3822 	 */
3823 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3824 
3825 	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
3826 	    state->id_rx_buf_sz, KM_SLEEP);
3827 
3828 	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
3829 	    sizeof (ibd_rwqe_t), KM_SLEEP);
3830 
3831 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3832 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3833 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
3834 	for (i = 0; i < state->id_rx_nqueues; i++) {
3835 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3836 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3837 	}
3838 
3839 	/*
3840 	 * Do one memory registration on the entire rxbuf area
3841 	 */
3842 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3843 	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
3844 	mem_attr.mr_as = NULL;
3845 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3846 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3847 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3848 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3849 		kmem_free(state->id_rx_wqes,
3850 		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
3851 		kmem_free(state->id_rx_bufs,
3852 		    state->id_num_rwqe * state->id_rx_buf_sz);
3853 		state->id_rx_bufs = NULL;
3854 		state->id_rx_wqes = NULL;
3855 		return (DDI_FAILURE);
3856 	}
3857 
3858 	return (DDI_SUCCESS);
3859 }
3860 
3861 /*
3862  * Allocate the statically allocated Rx buffer list.
3863  */
3864 static int
3865 ibd_init_rxlist(ibd_state_t *state)
3866 {
3867 	ibd_rwqe_t *rwqe, *next;
3868 	ibd_wqe_t *list;
3869 	ibt_lkey_t lkey;
3870 	int i;
3871 	uint_t len;
3872 	uint8_t *bufaddr;
3873 
3874 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3875 	if (state->id_rx_free_list.dl_head != NULL) {
3876 		/* rx rsrcs were never freed.  Just repost them */
3877 		len = state->id_rx_buf_sz;
3878 		list = state->id_rx_free_list.dl_head;
3879 		state->id_rx_free_list.dl_head = NULL;
3880 		state->id_rx_free_list.dl_cnt = 0;
3881 		mutex_exit(&state->id_rx_free_list.dl_mutex);
3882 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3883 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3884 			if ((rwqe->rwqe_im_mblk = desballoc(
3885 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
3886 			    &rwqe->w_freemsg_cb)) == NULL) {
3887 				/* allow freemsg_cb to free the rwqes */
3888 				if (atomic_dec_32_nv(&state->id_running) != 0) {
3889 					cmn_err(CE_WARN, "ibd_init_rxlist: "
3890 					    "id_running was not 1\n");
3891 				}
3892 				DPRINT(10, "ibd_init_rxlist : "
3893 				    "failed in desballoc()");
3894 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3895 				    rwqe = next) {
3896 					next = WQE_TO_RWQE(rwqe->rwqe_next);
3897 					if (rwqe->rwqe_im_mblk) {
3898 						atomic_inc_32(&state->
3899 						    id_rx_list.
3900 						    dl_bufs_outstanding);
3901 						freemsg(rwqe->rwqe_im_mblk);
3902 					} else
3903 						ibd_free_rwqe(state, rwqe);
3904 				}
3905 				atomic_inc_32(&state->id_running);
3906 				return (DDI_FAILURE);
3907 			}
3908 		}
3909 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
3910 		return (DDI_SUCCESS);
3911 	}
3912 	mutex_exit(&state->id_rx_free_list.dl_mutex);
3913 
3914 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3915 		return (DDI_FAILURE);
3916 
3917 	/*
3918 	 * Allocate and setup the rwqe list
3919 	 */
3920 	len = state->id_rx_buf_sz;
3921 	lkey = state->id_rx_mr_desc.md_lkey;
3922 	rwqe = state->id_rx_wqes;
3923 	bufaddr = state->id_rx_bufs;
3924 	list = NULL;
3925 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
3926 		rwqe->w_state = state;
3927 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3928 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3929 
3930 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3931 
3932 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3933 		    &rwqe->w_freemsg_cb)) == NULL) {
3934 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3935 			/* allow freemsg_cb to free the rwqes */
3936 			if (atomic_dec_32_nv(&state->id_running) != 0) {
3937 				cmn_err(CE_WARN, "ibd_init_rxlist: "
3938 				    "id_running was not 1\n");
3939 			}
3942 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3943 			    rwqe = next) {
3944 				next = WQE_TO_RWQE(rwqe->rwqe_next);
3945 				freemsg(rwqe->rwqe_im_mblk);
3946 			}
3947 			atomic_inc_32(&state->id_running);
3948 
3949 			/* remove references to the freed rwqes */
3950 			mutex_enter(&state->id_rx_free_list.dl_mutex);
3951 			state->id_rx_free_list.dl_head = NULL;
3952 			state->id_rx_free_list.dl_cnt = 0;
3953 			mutex_exit(&state->id_rx_free_list.dl_mutex);
3954 
3955 			ibd_fini_rxlist(state);
3956 			return (DDI_FAILURE);
3957 		}
3958 
3959 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3960 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
3961 		    (ib_vaddr_t)(uintptr_t)bufaddr;
3962 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3963 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3964 		rwqe->w_rwr.wr_nds = 1;
3965 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3966 
3967 		rwqe->rwqe_next = list;
3968 		list = RWQE_TO_WQE(rwqe);
3969 	}
3970 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
3971 
3972 	return (DDI_SUCCESS);
3973 }
3974 
3975 static void
3976 ibd_free_rx_copybufs(ibd_state_t *state)
3977 {
3978 	int i;
3979 
3980 	/*
3981 	 * Unregister rxbuf mr
3982 	 */
3983 	if (ibt_deregister_mr(state->id_hca_hdl,
3984 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
3985 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3986 	}
3987 	state->id_rx_mr_hdl = NULL;
3988 
3989 	/*
3990 	 * Free rxbuf memory
3991 	 */
3992 	for (i = 0; i < state->id_rx_nqueues; i++) {
3993 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3994 		mutex_destroy(&rxp->rx_post_lock);
3995 	}
3996 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
3997 	    sizeof (ibd_rx_queue_t));
3998 	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
3999 	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
4000 	state->id_rx_queues = NULL;
4001 	state->id_rx_wqes = NULL;
4002 	state->id_rx_bufs = NULL;
4003 }
4004 
4005 static void
4006 ibd_free_rx_rsrcs(ibd_state_t *state)
4007 {
4008 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4009 	if (state->id_rx_free_list.dl_head == NULL) {
4010 		/* already freed */
4011 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4012 		return;
4013 	}
4014 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe);
4015 	ibd_free_rx_copybufs(state);
4016 	state->id_rx_free_list.dl_cnt = 0;
4017 	state->id_rx_free_list.dl_head = NULL;
4018 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4019 }
4020 
4021 /*
4022  * Free the statically allocated Rx buffer list.
4023  */
4024 static void
4025 ibd_fini_rxlist(ibd_state_t *state)
4026 {
4027 	ibd_rwqe_t *rwqe;
4028 	int i;
4029 
4030 	/* run through the rx_queue's, calling freemsg() */
4031 	for (i = 0; i < state->id_rx_nqueues; i++) {
4032 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4033 		mutex_enter(&rxp->rx_post_lock);
4034 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4035 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4036 			freemsg(rwqe->rwqe_im_mblk);
4037 			rxp->rx_cnt--;
4038 		}
4039 		rxp->rx_head = NULL;
4040 		mutex_exit(&rxp->rx_post_lock);
4041 	}
4042 
4043 	/* cannot free rx resources unless gld returned everything */
4044 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4045 		ibd_free_rx_rsrcs(state);
4046 }
4047 
4048 /*
4049  * Free an allocated recv wqe.
4050  */
4051 /* ARGSUSED */
4052 static void
4053 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4054 {
4055 	/*
4056 	 * desballoc() failed (no memory).
4057 	 *
4058 	 * This rwqe is placed on a free list so that it
4059 	 * can be reinstated when memory is available.
4060 	 *
4061 	 * NOTE: no code currently exists to reinstate
4062 	 * these "lost" rwqes.
4063 	 */
4064 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4065 	state->id_rx_free_list.dl_cnt++;
4066 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4067 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4068 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4069 }
4070 
4071 /*
4072  * IBA Rx completion queue handler. Guaranteed to be single
4073  * threaded and nonreentrant for this CQ.
4074  */
4075 /* ARGSUSED */
4076 static void
4077 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4078 {
4079 	ibd_state_t *state = (ibd_state_t *)arg;
4080 
4081 	atomic_inc_64(&state->id_num_intrs);
4082 
4083 	if (ibd_rx_softintr == 1) {
4084 		mutex_enter(&state->id_rcq_poll_lock);
4085 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4086 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4087 			mutex_exit(&state->id_rcq_poll_lock);
4088 			return;
4089 		} else {
4090 			mutex_exit(&state->id_rcq_poll_lock);
4091 			ddi_trigger_softintr(state->id_rx);
4092 		}
4093 	} else
4094 		(void) ibd_intr((caddr_t)state);
4095 }
4096 
4097 /*
4098  * CQ handler for Tx completions, when the Tx CQ is in
4099  * interrupt driven mode.
4100  */
4101 /* ARGSUSED */
4102 static void
4103 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4104 {
4105 	ibd_state_t *state = (ibd_state_t *)arg;
4106 
4107 	atomic_inc_64(&state->id_num_intrs);
4108 
4109 	if (ibd_tx_softintr == 1) {
4110 		mutex_enter(&state->id_scq_poll_lock);
4111 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4112 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4113 			mutex_exit(&state->id_scq_poll_lock);
4114 			return;
4115 		} else {
4116 			mutex_exit(&state->id_scq_poll_lock);
4117 			ddi_trigger_softintr(state->id_tx);
4118 		}
4119 	} else
4120 		(void) ibd_tx_recycle((caddr_t)state);
4121 }
4122 
4123 /*
4124  * Multicast group create/delete trap handler. These will be delivered
4125  * on a kernel thread (handling can thus block) and can be invoked
4126  * concurrently. The handler can be invoked anytime after it is
4127  * registered and before ibt_detach().
4128  */
4129 /* ARGSUSED */
4130 static void
4131 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4132     ibt_subnet_event_t *event)
4133 {
4134 	ibd_state_t *state = (ibd_state_t *)arg;
4135 	ibd_req_t *req;
4136 
4137 	/*
4138 	 * The trap handler will get invoked once for every event for
4139 	 * every port. The input "gid" is the GID0 of the port the
4140 	 * trap came in on; we just need to act on traps that came
4141 	 * to our port, meaning the port on which the ipoib interface
4142 	 * resides. Since ipoib uses GID0 of the port, we just match
4143 	 * the gids to check whether we need to handle the trap.
4144 	 */
4145 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4146 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4147 		return;
4148 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4149 
4150 	DPRINT(10, "ibd_snet_notices_handler : %d\n", code);
4151 
4152 	switch (code) {
4153 		case IBT_SM_EVENT_UNAVAILABLE:
4154 			/*
4155 			 * If we are in promiscuous mode or have
4156 			 * sendnonmembers, we need to print a warning
4157 			 * message right now. Else, just store the
4158 			 * information, print when we enter promiscuous
4159 			 * mode or attempt nonmember send. We might
4160 			 * also want to stop caching sendnonmember.
4161 			 */
4162 			ibd_print_warn(state, "IBA multicast support "
4163 			    "degraded due to unavailability of multicast "
4164 			    "traps");
4165 			break;
4166 		case IBT_SM_EVENT_AVAILABLE:
4167 			/*
4168 			 * If we printed a warning message above or
4169 			 * while trying to nonmember send or get into
4170 			 * promiscuous mode, print an okay message.
4171 			 */
4172 			ibd_print_warn(state, "IBA multicast support "
4173 			    "restored due to availability of multicast "
4174 			    "traps");
4175 			break;
4176 		case IBT_SM_EVENT_MCG_CREATED:
4177 		case IBT_SM_EVENT_MCG_DELETED:
4178 			/*
4179 			 * Common processing of creation/deletion traps.
4180 			 * First check if the instance is being
4181 			 * [de]initialized; back off then, without doing
4182 			 * anything more, since we are not sure if the
4183 			 * async thread is around, or whether we might
4184 			 * be racing with the detach code in ibd_m_stop()
4185 			 * that scans the mcg list.
4186 			 */
4187 			if (!ibd_async_safe(state))
4188 				return;
4189 
4190 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4191 			req->rq_gid = event->sm_notice_gid;
4192 			req->rq_ptr = (void *)code;
4193 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4194 			break;
4195 	}
4196 }
4197 
4198 static void
4199 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4200 {
4201 	ib_gid_t mgid = req->rq_gid;
4202 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4203 
4204 	DPRINT(10, "ibd_async_trap : %d\n", code);
4205 
4206 	/*
4207 	 * Atomically search the nonmember and sendonlymember lists and
4208 	 * delete.
4209 	 */
4210 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4211 
4212 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4213 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4214 
4215 		/*
4216 		 * If in promiscuous mode, try to join/attach to the new
4217 		 * mcg. Given the unreliable out-of-order mode of trap
4218 		 * delivery, we can never be sure whether it is a problem
4219 		 * if the join fails. Thus, we warn the admin of a failure
4220 		 * if this was a creation trap. Note that the trap might
4221 		 * actually be reporting a long past event, and the mcg
4222 		 * might already have been deleted, thus we might be warning
4223 		 * in vain.
4224 		 */
4225 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4226 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4227 			ibd_print_warn(state, "IBA promiscuous mode missed "
4228 			    "new multicast gid %016llx:%016llx",
4229 			    (u_longlong_t)mgid.gid_prefix,
4230 			    (u_longlong_t)mgid.gid_guid);
4231 	}
4232 
4233 	/*
4234 	 * Free the request slot allocated by the subnet event thread.
4235 	 */
4236 	ibd_async_done(state);
4237 }
4238 
4239 /*
4240  * GLDv3 entry point to get capabilities.
4241  */
4242 static boolean_t
4243 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4244 {
4245 	ibd_state_t *state = arg;
4246 
4247 	switch (cap) {
4248 	case MAC_CAPAB_HCKSUM: {
4249 		uint32_t *txflags = cap_data;
4250 
4251 		/*
4252 		 * We either do full checksum offload or none at all
4253 		 */
4254 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4255 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4256 		else
4257 			return (B_FALSE);
4258 		break;
4259 	}
4260 
4261 	case MAC_CAPAB_LSO: {
4262 		mac_capab_lso_t *cap_lso = cap_data;
4263 
4264 		/*
4265 		 * In addition to the capability and policy, since LSO
4266 		 * relies on hw checksum, we'll not enable LSO if we
4267 		 * don't have hw checksum.  Of course, if the HCA doesn't
4268 		 * provide the reserved lkey capability, enabling LSO will
4269 		 * actually affect performance adversely, so we'll disable
4270 		 * LSO even for that case.
4271 		 */
4272 		if (!state->id_lso_policy || !state->id_lso_capable)
4273 			return (B_FALSE);
4274 
4275 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4276 			return (B_FALSE);
4277 
4278 		if (state->id_hca_res_lkey_capab == 0) {
4279 			ibd_print_warn(state, "no reserved-lkey capability, "
4280 			    "disabling LSO");
4281 			return (B_FALSE);
4282 		}
4283 
4284 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4285 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4286 		break;
4287 	}
4288 
4289 	default:
4290 		return (B_FALSE);
4291 	}
4292 
4293 	return (B_TRUE);
4294 }
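
/*
 * Illustrative sketch, not part of the driver (hypothetical guard macro
 * and helper name): the conditions under which ibd_m_getcapab() above
 * advertises LSO, collapsed into a single predicate.
 */
#ifdef IBD_EXAMPLE_SKETCH
static boolean_t
example_lso_offered(ibd_state_t *state)
{
	return ((state->id_lso_policy && state->id_lso_capable &&
	    (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) != 0 &&
	    state->id_hca_res_lkey_capab != 0) ? B_TRUE : B_FALSE);
}
#endif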
4295 
4296 static int
4297 ibd_get_port_details(ibd_state_t *state)
4298 {
4299 	ibt_hca_portinfo_t *port_infop;
4300 	ibt_status_t ret;
4301 	uint_t psize, port_infosz;
4302 
4303 	mutex_enter(&state->id_link_mutex);
4304 
4305 	/*
4306 	 * Query for port information
4307 	 */
4308 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4309 	    &port_infop, &psize, &port_infosz);
4310 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
4311 		mutex_exit(&state->id_link_mutex);
4312 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4313 		    "failed, ret=%d", ret);
4314 		return (ENETDOWN);
4315 	}
4316 
4317 	/*
4318 	 * If the link already went down by the time we get here,
4319 	 * give up
4320 	 */
4321 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
4322 		mutex_exit(&state->id_link_mutex);
4323 		ibt_free_portinfo(port_infop, port_infosz);
4324 		DPRINT(10, "ibd_get_port_details: port is not active");
4325 		return (ENETDOWN);
4326 	}
4327 
4328 	/*
4329 	 * If the link is active, verify the pkey
4330 	 */
4331 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4332 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4333 		mutex_exit(&state->id_link_mutex);
4334 		ibt_free_portinfo(port_infop, port_infosz);
4335 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
4336 		    "failed, ret=%d", ret);
4337 		return (ENONET);
4338 	}
4339 
4340 	state->id_mtu = (128 << port_infop->p_mtu);
4341 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4342 	state->id_sgid = *port_infop->p_sgid_tbl;
4343 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4344 	state->id_link_state = LINK_STATE_UP;
4345 
4346 	mutex_exit(&state->id_link_mutex);
4347 	ibt_free_portinfo(port_infop, port_infosz);
4348 
4349 	/*
4350 	 * Now that the port is active, record the port speed
4351 	 */
4352 	state->id_link_speed = ibd_get_portspeed(state);
4353 
4354 	return (0);
4355 }
4356 
4357 static int
4358 ibd_alloc_cqs(ibd_state_t *state)
4359 {
4360 	ibt_hca_attr_t hca_attrs;
4361 	ibt_cq_attr_t cq_attr;
4362 	ibt_status_t ret;
4363 	uint32_t real_size;
4364 
4365 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4366 	ASSERT(ret == IBT_SUCCESS);
4367 
4368 	/*
4369 	 * Allocate the Rx and Tx completion queues:
4370 	 * There is no point in having more cqe's than outstanding wqe's,
4371 	 * except that the CQ would be signaled for overflow if the last
4372 	 * wqe completed while none of the earlier cqe's had been polled.
4373 	 * So each CQ is sized one entry larger than its wqe count (and the
4374 	 * wqe count is trimmed if the HCA cannot support that CQ size).
4375 	 */
4376 	cq_attr.cq_sched = NULL;
4377 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4378 
4379 	/*
4380 	 * Allocate Receive CQ.
4381 	 */
4382 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4383 		cq_attr.cq_size = state->id_num_rwqe + 1;
4384 	} else {
4385 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4386 		state->id_num_rwqe = cq_attr.cq_size - 1;
4387 	}
4388 
4389 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4390 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4391 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4392 		    "failed, ret=%d\n", ret);
4393 		return (DDI_FAILURE);
4394 	}
4395 
4396 	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4397 	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4398 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4399 		    "moderation failed, ret=%d\n", ret);
4400 	}
4401 
4402 	/* make the #rx wc's the same as max rx chain size */
4403 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
4404 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4405 	    state->id_rxwcs_size, KM_SLEEP);
4406 
4407 	/*
4408 	 * Allocate Send CQ.
4409 	 */
4410 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4411 		cq_attr.cq_size = state->id_num_swqe + 1;
4412 	} else {
4413 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4414 		state->id_num_swqe = cq_attr.cq_size - 1;
4415 	}
4416 
4417 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4418 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4419 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4420 		    "failed, ret=%d\n", ret);
4421 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4422 		    state->id_rxwcs_size);
4423 		(void) ibt_free_cq(state->id_rcq_hdl);
4424 		return (DDI_FAILURE);
4425 	}
4426 	if ((ret = ibt_modify_cq(state->id_scq_hdl,
4427 	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
4428 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4429 		    "moderation failed, ret=%d\n", ret);
4430 	}
4431 
4432 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
4433 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4434 	    state->id_txwcs_size, KM_SLEEP);
4435 
4436 	/*
4437 	 * Print a message if we could not allocate as many wqe's as
4438 	 * were requested.
4439 	 */
4440 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4441 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4442 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4443 	}
4444 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4445 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4446 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4447 	}
4448 
4449 	return (DDI_SUCCESS);
4450 }
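
/*
 * Worked example, illustrative only (hypothetical guard macro, helper
 * name and numbers): the CQ sizing above. If the HCA reported a maximum
 * CQ size of 4096 while 8000 rwqe's were requested, the Rx CQ would be
 * sized at 4096 entries and id_num_rwqe trimmed to 4095, preserving the
 * one spare cqe that prevents the overflow described above.
 */
#ifdef IBD_EXAMPLE_SKETCH
static uint32_t
example_size_cq(uint32_t hca_max_cq_sz, uint32_t *num_wqe)
{
	uint32_t cq_size;

	if (hca_max_cq_sz >= *num_wqe + 1) {
		cq_size = *num_wqe + 1;		/* one spare cqe */
	} else {
		cq_size = hca_max_cq_sz;	/* e.g. 4096 */
		*num_wqe = cq_size - 1;		/* e.g. 8000 -> 4095 */
	}

	return (cq_size);
}
#endif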
4451 
4452 static int
4453 ibd_setup_ud_channel(ibd_state_t *state)
4454 {
4455 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4456 	ibt_ud_chan_query_attr_t ud_chan_attr;
4457 	ibt_status_t ret;
4458 
4459 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
4460 	if (state->id_hca_res_lkey_capab)
4461 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4462 	if (state->id_lso_policy && state->id_lso_capable)
4463 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4464 
4465 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4466 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4467 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4468 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4469 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4470 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4471 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4472 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4473 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4474 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4475 	ud_alloc_attr.ud_clone_chan	= NULL;
4476 
4477 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4478 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4479 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4480 		    "failed, ret=%d\n", ret);
4481 		return (DDI_FAILURE);
4482 	}
4483 
4484 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4485 	    &ud_chan_attr)) != IBT_SUCCESS) {
4486 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4487 		    "failed, ret=%d\n", ret);
4488 		(void) ibt_free_channel(state->id_chnl_hdl);
4489 		return (DDI_FAILURE);
4490 	}
4491 
4492 	state->id_qpnum = ud_chan_attr.ud_qpn;
4493 
4494 	return (DDI_SUCCESS);
4495 }
4496 
4497 static int
4498 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
4499 {
4500 	uint32_t progress = state->id_mac_state;
4501 	uint_t attempts;
4502 	ibt_status_t ret;
4503 	ib_gid_t mgid;
4504 	ibd_mce_t *mce;
4505 	uint8_t jstate;
4506 
4507 	if (atomic_dec_32_nv(&state->id_running) != 0)
4508 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
4509 
4510 	/*
4511 	 * Before we try to stop/undo whatever we did in ibd_start(),
4512 	 * we need to mark the link state appropriately to prevent the
4513 	 * ip layer from using this instance for any new transfers. Note
4514 	 * that if the original state of the link was "up" when we're
4515 	 * here, we'll set the final link state to "unknown", to behave
4516 	 * in the same fashion as other ethernet drivers.
4517 	 */
4518 	mutex_enter(&state->id_link_mutex);
4519 	if (cur_link_state == LINK_STATE_DOWN) {
4520 		state->id_link_state = cur_link_state;
4521 	} else {
4522 		state->id_link_state = LINK_STATE_UNKNOWN;
4523 	}
4524 	mutex_exit(&state->id_link_mutex);
4525 	mac_link_update(state->id_mh, state->id_link_state);
4526 
4527 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4528 	if (progress & IBD_DRV_STARTED) {
4529 		state->id_mac_state &= (~IBD_DRV_STARTED);
4530 	}
4531 
4532 	/* Stop listen under Reliable Connected Mode */
4533 	if (progress & IBD_DRV_RC_LISTEN) {
4534 		ASSERT(state->id_enable_rc);
4535 		if (state->rc_listen_hdl != NULL) {
4536 			ibd_rc_stop_listen(state);
4537 		}
4538 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
4539 	}
4540 
4541 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
4542 		(void) ibd_rc_close_all_chan(state);
4543 	}
4544 
4545 	/*
4546 	 * First, stop receive interrupts; this stops the driver from
4547 	 * handing up buffers to higher layers.  Wait for receive buffers
4548 	 * to be returned and give up after 1 second.
4549 	 */
4550 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4551 		attempts = 10;
4552 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
4553 		    0) > 0) {
4554 			delay(drv_usectohz(100000));
4555 			if (--attempts == 0) {
4556 				/*
4557 				 * There are pending bufs with the network
4558 				 * layer and we have no choice but to wait
4559 				 * until it is done with them. Reap all the
4560 				 * Tx/Rx completions that were posted since
4561 				 * we turned off the notification and
4562 				 * return failure.
4563 				 */
4564 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
4565 				DPRINT(2, "ibd_undo_start: "
4566 				    "reclaiming failed");
4567 				break;
4568 			}
4569 		}
4570 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4571 	}
4572 
4573 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
4574 		ibd_rc_fini_tx_largebuf_list(state);
4575 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
4576 	}
4577 
4578 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
4579 		ASSERT(state->id_enable_rc);
4580 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
4581 			ibd_rc_fini_srq_list(state);
4582 			state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
4583 		} else {
4584 			cmn_err(CE_CONT, "ibd_undo_start: srq bufs "
4585 			    "outstanding\n");
4586 		}
4587 	}
4588 
4589 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4590 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4591 
4592 		mutex_enter(&state->id_trap_lock);
4593 		state->id_trap_stop = B_TRUE;
4594 		while (state->id_trap_inprog > 0)
4595 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4596 		mutex_exit(&state->id_trap_lock);
4597 
4598 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4599 	}
4600 
4601 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4602 		/*
4603 		 * Flushing the channel ensures that all pending WQE's
4604 		 * are marked with flush_error and handed to the CQ. It
4605 		 * does not guarantee the invocation of the CQ handler.
4606 		 * This call is guaranteed to return successfully for
4607 		 * UD QPNs.
4608 		 */
4609 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
4610 		    IBT_SUCCESS) {
4611 			DPRINT(10, "ibd_undo_start: flush_channel "
4612 			    "failed, ret=%d", ret);
4613 		}
4614 
4615 		/*
4616 		 * Give some time for the TX CQ handler to process the
4617 		 * completions.
4618 		 */
4619 		mutex_enter(&state->id_tx_list.dl_mutex);
4620 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
4621 		attempts = 10;
4622 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
4623 		    != state->id_num_swqe) {
4624 			if (--attempts == 0)
4625 				break;
4626 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
4627 			mutex_exit(&state->id_tx_list.dl_mutex);
4628 			delay(drv_usectohz(100000));
4629 			mutex_enter(&state->id_tx_list.dl_mutex);
4630 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
4631 		}
4632 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4633 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
4634 		    state->id_num_swqe) {
4635 			cmn_err(CE_WARN, "tx resources not freed\n");
4636 		}
4637 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
4638 		mutex_exit(&state->id_tx_list.dl_mutex);
4639 
4640 		attempts = 10;
4641 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4642 			if (--attempts == 0)
4643 				break;
4644 			delay(drv_usectohz(100000));
4645 		}
4646 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4647 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4648 			cmn_err(CE_WARN, "rx resources not freed\n");
4649 		}
4650 
4651 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4652 	}
4653 
4654 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4655 		/*
4656 		 * No new async requests will be posted since the device
4657 		 * link state has been marked as unknown; completion handlers
4658 		 * have been turned off, so Tx handler will not cause any
4659 		 * more IBD_ASYNC_REAP requests.
4660 		 *
4661 		 * Queue a request for the async thread to exit, which will
4662 		 * be serviced after any pending ones. This can take a while,
4663 		 * especially if the SM is unreachable, since IBMF will slowly
4664 		 * timeout each SM request issued by the async thread.  Reap
4665 		 * the thread before continuing on, we do not want it to be
4666 		 * lingering in modunloaded code (or we could move the reap
4667 		 * to ibd_detach(), provided we keep track of the current
4668 		 * id_async_thrid somewhere safe).
4669 		 */
4670 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4671 		thread_join(state->id_async_thrid);
4672 
4673 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4674 	}
4675 
4676 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4677 		/*
4678 		 * Drop all residual full/non membership. This includes full
4679 		 * membership to the broadcast group, and any nonmembership
4680 		 * acquired during transmits. We do this after the Tx completion
4681 		 * handlers are done, since those might result in some late
4682 		 * leaves; this also eliminates a potential race with that
4683 		 * path wrt the mc full list insert/delete. Trap handling
4684 		 * has also been suppressed at this point. Thus, no locks
4685 		 * are required while traversing the mc full list.
4686 		 */
4687 		DPRINT(2, "ibd_undo_start: clear full cache entries");
4688 		mce = list_head(&state->id_mc_full);
4689 		while (mce != NULL) {
4690 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4691 			jstate = mce->mc_jstate;
4692 			mce = list_next(&state->id_mc_full, mce);
4693 			ibd_leave_group(state, mgid, jstate);
4694 		}
4695 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4696 	}
4697 
4698 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4699 		ibd_fini_rxlist(state);
4700 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4701 	}
4702 
4703 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4704 		ibd_fini_txlist(state);
4705 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4706 	}
4707 
4708 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4709 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
4710 		    IBT_SUCCESS) {
4711 			DPRINT(10, "ibd_undo_start: free_channel "
4712 			    "failed, ret=%d", ret);
4713 		}
4714 
4715 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4716 	}
4717 
4718 	if (progress & IBD_DRV_CQS_ALLOCD) {
4719 		kmem_free(state->id_txwcs,
4720 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
4721 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
4722 		    IBT_SUCCESS) {
4723 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
4724 			    "failed, ret=%d", ret);
4725 		}
4726 
4727 		kmem_free(state->id_rxwcs,
4728 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4729 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
4730 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
4731 			    "ret=%d", ret);
4732 		}
4733 
4734 		state->id_txwcs = NULL;
4735 		state->id_rxwcs = NULL;
4736 		state->id_scq_hdl = NULL;
4737 		state->id_rcq_hdl = NULL;
4738 
4739 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4740 	}
4741 
4742 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4743 		mutex_enter(&state->id_ac_mutex);
4744 		mod_hash_destroy_hash(state->id_ah_active_hash);
4745 		mutex_exit(&state->id_ac_mutex);
4746 		ibd_acache_fini(state);
4747 
4748 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4749 	}
4750 
4751 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4752 		/*
4753 		 * If we'd created the ipoib broadcast group and had
4754 		 * successfully joined it, leave it now
4755 		 */
4756 		if (state->id_bgroup_created) {
4757 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
4758 			jstate = IB_MC_JSTATE_FULL;
4759 			(void) ibt_leave_mcg(state->id_sgid, mgid,
4760 			    state->id_sgid, jstate);
4761 		}
4762 		ibt_free_mcg_info(state->id_mcinfo, 1);
4763 
4764 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4765 	}
4766 
4767 	return (DDI_SUCCESS);
4768 }
4769 
4770 /*
4771  * These pair of routines are used to set/clear the condition that
4772  * the caller is likely to do something to change the id_mac_state.
4773  * If there's already someone doing either a start or a stop (possibly
4774  * due to the async handler detecting a pkey relocation event, a plumb
4775  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4776  * that's done.
4777  */
4778 static void
4779 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4780 {
4781 	mutex_enter(&state->id_macst_lock);
4782 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4783 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4784 
4785 	state->id_mac_state |= flag;
4786 	mutex_exit(&state->id_macst_lock);
4787 }
4788 
4789 static void
4790 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4791 {
4792 	mutex_enter(&state->id_macst_lock);
4793 	state->id_mac_state &= (~flag);
4794 	cv_signal(&state->id_macst_cv);
4795 	mutex_exit(&state->id_macst_lock);
4796 }
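
/*
 * Usage sketch, illustrative only (hypothetical guard macro and function
 * name): how the progress flags above serialize callers that are about
 * to change id_mac_state; ibd_m_start() and ibd_m_stop() below follow
 * the same pattern with their respective flags.
 */
#ifdef IBD_EXAMPLE_SKETCH
static void
example_serialized_restart(ibd_state_t *state)
{
	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);

	(void) ibd_undo_start(state, LINK_STATE_DOWN);
	(void) ibd_start(state);

	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
}
#endif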
4797 
4798 /*
4799  * GLDv3 entry point to start hardware.
4800  */
4801 /*ARGSUSED*/
4802 static int
4803 ibd_m_start(void *arg)
4804 {
4805 	ibd_state_t *state = arg;
4806 	int	ret;
4807 
4808 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4809 
4810 	ret = ibd_start(state);
4811 
4812 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4813 
4814 	return (ret);
4815 }
4816 
4817 static int
4818 ibd_start(ibd_state_t *state)
4819 {
4820 	kthread_t *kht;
4821 	int err;
4822 	ibt_status_t ret;
4823 
4824 	if (state->id_mac_state & IBD_DRV_STARTED)
4825 		return (DDI_SUCCESS);
4826 
4827 	if (atomic_inc_32_nv(&state->id_running) != 1) {
4828 		DPRINT(10, "ibd_start: id_running is non-zero");
4829 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
4830 		atomic_dec_32(&state->id_running);
4831 		return (EINVAL);
4832 	}
4833 
4834 	/*
4835 	 * Get port details; if we fail here, very likely the port
4836 	 * state is inactive or the pkey can't be verified.
4837 	 */
4838 	if ((err = ibd_get_port_details(state)) != 0) {
4839 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4840 		goto start_fail;
4841 	}
4842 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4843 
4844 	/*
4845 	 * Find the IPoIB broadcast group
4846 	 */
4847 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4848 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4849 		err = ENOTACTIVE;
4850 		goto start_fail;
4851 	}
4852 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4853 
4854 	/*
4855 	 * Initialize per-interface caches and lists; if we fail here,
4856 	 * it is most likely due to a lack of resources
4857 	 */
4858 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4859 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4860 		err = ENOMEM;
4861 		goto start_fail;
4862 	}
4863 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4864 
4865 	/*
4866 	 * Allocate send and receive completion queues
4867 	 */
4868 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4869 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4870 		err = ENOMEM;
4871 		goto start_fail;
4872 	}
4873 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4874 
4875 	/*
4876 	 * Setup a UD channel
4877 	 */
4878 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4879 		err = ENOMEM;
4880 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4881 		goto start_fail;
4882 	}
4883 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4884 
4885 	/*
4886 	 * Allocate and initialize the tx buffer list
4887 	 */
4888 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4889 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4890 		err = ENOMEM;
4891 		goto start_fail;
4892 	}
4893 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4894 
4895 	/*
4896 	 * Create the send cq handler here
4897 	 */
4898 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4899 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4900 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4901 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4902 		    "failed, ret=%d", ret);
4903 		err = EINVAL;
4904 		goto start_fail;
4905 	}
4906 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4907 
4908 	/*
4909 	 * Allocate and initialize the rx buffer list
4910 	 */
4911 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4912 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4913 		err = ENOMEM;
4914 		goto start_fail;
4915 	}
4916 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4917 
4918 	/*
4919 	 * Join IPoIB broadcast group
4920 	 */
4921 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4922 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4923 		err = ENOTACTIVE;
4924 		goto start_fail;
4925 	}
4926 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4927 
4928 	/*
4929 	 * Create the async thread; thread_create never fails.
4930 	 */
4931 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4932 	    TS_RUN, minclsyspri);
4933 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4934 	state->id_async_thrid = kht->t_did;
4935 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4936 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4937 
4938 	/*
4939 	 * When we did mac_register() in ibd_attach(), we didn't register
4940 	 * the real macaddr and we didn't have the true port mtu. Now that
4941 	 * we're almost ready, set the local mac address and broadcast
4942 	 * addresses and update gldv3 about the real values of these
4943 	 * parameters.
4944 	 */
4945 	if (state->id_enable_rc) {
4946 		ibd_h2n_mac(&state->id_macaddr,
4947 		    IBD_MAC_ADDR_RC + state->id_qpnum,
4948 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4949 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
4950 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4951 	} else {
4952 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4953 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4954 	}
4955 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4956 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4957 
4958 	if (!state->id_enable_rc) {
4959 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
4960 		    - IPOIB_HDRSIZE);
4961 	}
4962 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4963 
4964 	/*
4965 	 * Setup the receive cq handler
4966 	 */
4967 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4968 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4969 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4970 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4971 		    "failed, ret=%d", ret);
4972 		err = EINVAL;
4973 		goto start_fail;
4974 	}
4975 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4976 
4977 	/*
4978 	 * Setup the subnet notices handler after we've initialized the acache/
4979 	 * mcache and started the async thread, both of which are required for
4980 	 * the trap handler to function properly.
4981 	 *
4982 	 * Now that the async thread has been started (and we've already done
4983 	 * a mac_register() during attach so mac_tx_update() can be called
4984 	 * if necessary without any problem), we can enable the trap handler
4985 	 * to queue requests to the async thread.
4986 	 */
4987 	ibt_register_subnet_notices(state->id_ibt_hdl,
4988 	    ibd_snet_notices_handler, state);
4989 	mutex_enter(&state->id_trap_lock);
4990 	state->id_trap_stop = B_FALSE;
4991 	mutex_exit(&state->id_trap_lock);
4992 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4993 
4994 	if (state->id_enable_rc) {
4995 		if (state->rc_enable_srq) {
4996 			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
4997 				if (ibd_rc_repost_srq_free_list(state) !=
4998 				    IBT_SUCCESS) {
4999 					err = ENOMEM;
5000 					goto start_fail;
5001 				}
5002 			} else {
5003 				/* Allocate SRQ resource */
5004 				if (ibd_rc_init_srq_list(state) !=
5005 				    IBT_SUCCESS) {
5006 					err = ENOMEM;
5007 					goto start_fail;
5008 				}
5009 				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
5010 			}
5011 		}
5012 
5013 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
5014 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
5015 			    "failed");
5016 			err = ENOMEM;
5017 			goto start_fail;
5018 		}
5019 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
5020 
5021 		/* RC: begin to listen only after everything is available */
5022 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
5023 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
5024 			err = EINVAL;
5025 			goto start_fail;
5026 		}
5027 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
5028 	}
5029 
5030 	/*
5031 	 * Indicate link status to GLDv3 and higher layers. By default,
5032 	 * we assume we are in up state (which must have been true at
5033 	 * least at the time the broadcast mcg's were probed); if there
5034 	 * were any up/down transitions till the time we come here, the
5035 	 * async handler will have updated the last known state, which we
5036 	 * use to tell GLDv3. The async handler will not send any
5037 	 * notifications to GLDv3 till we reach here in the initialization
5038 	 * sequence.
5039 	 */
5040 	state->id_mac_state |= IBD_DRV_STARTED;
5041 	mac_link_update(state->id_mh, state->id_link_state);
5042 
5043 	return (DDI_SUCCESS);
5044 
5045 start_fail:
5046 	/*
5047 	 * If we ran into a problem during ibd_start() and ran into
5048 	 * some other problem during undoing our partial work, we can't
5049 	 * do anything about it.  Ignore any errors we might get from
5050 	 * ibd_undo_start() and just return the original error we got.
5051 	 */
5052 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
5053 	return (err);
5054 }
5055 
5056 /*
5057  * GLDv3 entry point to stop hardware from receiving packets.
5058  */
5059 /*ARGSUSED*/
5060 static void
5061 ibd_m_stop(void *arg)
5062 {
5063 	ibd_state_t *state = (ibd_state_t *)arg;
5064 
5065 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5066 
5067 	(void) ibd_undo_start(state, state->id_link_state);
5068 
5069 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5070 }
5071 
5072 /*
5073  * GLDv3 entry point to modify the device's mac address. We do not
5074  * allow address modifications.
5075  */
5076 static int
5077 ibd_m_unicst(void *arg, const uint8_t *macaddr)
5078 {
5079 	ibd_state_t *state = arg;
5080 
5081 	/*
5082 	 * Don't bother even comparing the macaddr if we haven't
5083 	 * completed ibd_m_start().
5084 	 */
5085 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5086 		return (0);
5087 
5088 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
5089 		return (0);
5090 	else
5091 		return (EINVAL);
5092 }
5093 
5094 /*
5095  * The blocking part of the IBA join/leave operations is done
5096  * out of here on the async thread.
5097  */
5098 static void
5099 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
5100 {
5101 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
5102 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
5103 
5104 	if (op == IBD_ASYNC_JOIN) {
5105 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
5106 			ibd_print_warn(state, "Join multicast group failed :"
5107 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5108 		}
5109 	} else {
5110 		/*
5111 		 * Here, we must search for the proper mcg_info and
5112 		 * use that to leave the group.
5113 		 */
5114 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
5115 	}
5116 }
5117 
5118 /*
5119  * GLDv3 entry point for multicast enable/disable requests.
5120  * This function queues the operation to the async thread and
5121  * returns success for a valid multicast address.
5122  */
5123 static int
5124 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
5125 {
5126 	ibd_state_t *state = (ibd_state_t *)arg;
5127 	ipoib_mac_t maddr, *mcast;
5128 	ib_gid_t mgid;
5129 	ibd_req_t *req;
5130 
5131 	/*
5132 	 * If we haven't completed ibd_m_start(), the async thread won't
5133 	 * have been started and id_bcaddr won't be set, so there's
5134 	 * no point in continuing.
5135 	 */
5136 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5137 		return (0);
5138 
5139 	/*
5140 	 * The incoming multicast address might not be aligned properly
5141 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
5142 	 * it to look like one though, to get the offsets of the mc gid,
5143 	 * since we know we are not going to dereference any values with
5144 	 * the ipoib_mac_t pointer.
5145 	 */
5146 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
5147 	mcast = &maddr;
5148 
5149 	/*
5150 	 * Check validity of MCG address. We could additionally check
5151 	 * that an enable/disable is not being issued on the "broadcast"
5152 	 * mcg, but since this operation is only invokable by privileged
5153 	 * programs anyway, we allow the flexibility to those dlpi apps.
5154 	 * Note that we do not validate the "scope" of the IBA mcg.
5155 	 */
5156 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
5157 		return (EINVAL);
5158 
5159 	/*
5160 	 * fill in multicast pkey and scope
5161 	 */
5162 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
5163 
5164 	/*
5165 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
5166 	 * nothing (i.e. we stay JOINed to the broadcast group done in
5167 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
5168 	 * requires being joined to the broadcast group at all times.
5169 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
5170 	 * depends on this.
5171 	 */
5172 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5173 		return (0);
5174 
5175 	ibd_n2h_gid(mcast, &mgid);
5176 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5177 	if (req == NULL)
5178 		return (ENOMEM);
5179 
5180 	req->rq_gid = mgid;
5181 
5182 	if (add) {
5183 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
5184 		    mgid.gid_prefix, mgid.gid_guid);
5185 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
5186 	} else {
5187 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
5188 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5189 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
5190 	}
5191 	return (0);
5192 }
5193 
5194 /*
5195  * The blocking part of the IBA promiscuous operations is done
5196  * out of here on the async thread. An invocation of this routine
5197  * can be due either to a dlpi request or to a port up/down
5198  * event.
5199  */
5200 static void
5201 ibd_async_unsetprom(ibd_state_t *state)
5202 {
5203 	ibd_mce_t *mce = list_head(&state->id_mc_non);
5204 	ib_gid_t mgid;
5205 
5206 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
5207 
5208 	while (mce != NULL) {
5209 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
5210 		mce = list_next(&state->id_mc_non, mce);
5211 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
5212 	}
5213 	state->id_prom_op = IBD_OP_NOTSTARTED;
5214 }
5215 
5216 /*
5217  * The blocking part of the IBA promiscuous operations is done
5218  * out of here on the async thread. An invocation of this routine
5219  * can be due either to a dlpi request or to a port up/down
5220  * event.
5221  */
5222 static void
5223 ibd_async_setprom(ibd_state_t *state)
5224 {
5225 	ibt_mcg_attr_t mcg_attr;
5226 	ibt_mcg_info_t *mcg_info;
5227 	ib_gid_t mgid;
5228 	uint_t numg;
5229 	int i;
5230 	char ret = IBD_OP_COMPLETED;
5231 
5232 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
5233 
5234 	/*
5235 	 * Obtain all active MC groups on the IB fabric with
5236 	 * specified criteria (scope + Pkey + Qkey + mtu).
5237 	 */
5238 	bzero(&mcg_attr, sizeof (mcg_attr));
5239 	mcg_attr.mc_pkey = state->id_pkey;
5240 	mcg_attr.mc_scope = state->id_scope;
5241 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
5242 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
5243 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
5244 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
5245 	    IBT_SUCCESS) {
5246 		ibd_print_warn(state, "Could not get list of IBA multicast "
5247 		    "groups");
5248 		ret = IBD_OP_ERRORED;
5249 		goto done;
5250 	}
5251 
5252 	/*
5253 	 * Iterate over the returned mcg's and join as NonMember
5254 	 * to the IP mcg's.
5255 	 */
5256 	for (i = 0; i < numg; i++) {
5257 		/*
5258 		 * Do a NonMember JOIN on the MC group.
5259 		 */
5260 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
5261 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5262 			ibd_print_warn(state, "IBA promiscuous mode missed "
5263 			    "multicast gid %016llx:%016llx",
5264 			    (u_longlong_t)mgid.gid_prefix,
5265 			    (u_longlong_t)mgid.gid_guid);
5266 	}
5267 
5268 	ibt_free_mcg_info(mcg_info, numg);
5269 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5270 done:
5271 	state->id_prom_op = ret;
5272 }
5273 
5274 /*
5275  * GLDv3 entry point for multicast promiscuous enable/disable requests.
5276  * GLDv3 assumes phys state receives more packets than multi state,
5277  * which is not true for IPoIB. Thus, treat the multi and phys
5278  * promiscuous states the same way to work with GLDv3's assumption.
5279  */
5280 static int
5281 ibd_m_promisc(void *arg, boolean_t on)
5282 {
5283 	ibd_state_t *state = (ibd_state_t *)arg;
5284 	ibd_req_t *req;
5285 
5286 	/*
5287 	 * The async thread won't have been started if we haven't
5288 	 * completed ibd_m_start().
5289 	 */
5290 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5291 		return (0);
5292 
5293 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5294 	if (req == NULL)
5295 		return (ENOMEM);
5296 	if (on) {
5297 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
5298 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
5299 	} else {
5300 		DPRINT(1, "ibd_m_promisc : unset_promisc");
5301 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
5302 	}
5303 
5304 	return (0);
5305 }
5306 
5307 /*
5308  * GLDv3 entry point for gathering statistics.
5309  */
5310 static int
5311 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
5312 {
5313 	ibd_state_t *state = (ibd_state_t *)arg;
5314 
5315 	switch (stat) {
5316 	case MAC_STAT_IFSPEED:
5317 		*val = state->id_link_speed;
5318 		break;
5319 	case MAC_STAT_MULTIRCV:
5320 		*val = state->id_multi_rcv;
5321 		break;
5322 	case MAC_STAT_BRDCSTRCV:
5323 		*val = state->id_brd_rcv;
5324 		break;
5325 	case MAC_STAT_MULTIXMT:
5326 		*val = state->id_multi_xmt;
5327 		break;
5328 	case MAC_STAT_BRDCSTXMT:
5329 		*val = state->id_brd_xmt;
5330 		break;
5331 	case MAC_STAT_RBYTES:
5332 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
5333 		    + state->rc_rcv_copy_byte;
5334 		break;
5335 	case MAC_STAT_IPACKETS:
5336 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
5337 		    + state->rc_rcv_copy_pkt;
5338 		break;
5339 	case MAC_STAT_OBYTES:
5340 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
5341 		break;
5342 	case MAC_STAT_OPACKETS:
5343 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
5344 		    state->rc_xmt_fragmented_pkt +
5345 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
5346 		break;
5347 	case MAC_STAT_OERRORS:
5348 		*val = state->id_ah_error;	/* failed AH translation */
5349 		break;
5350 	case MAC_STAT_IERRORS:
5351 		*val = 0;
5352 		break;
5353 	case MAC_STAT_NOXMTBUF:
5354 		*val = state->id_tx_short + state->rc_swqe_short +
5355 		    state->rc_xmt_buf_short;
5356 		break;
5357 	case MAC_STAT_NORCVBUF:
5358 	default:
5359 		return (ENOTSUP);
5360 	}
5361 
5362 	return (0);
5363 }
5364 
5365 static void
5366 ibd_async_txsched(ibd_state_t *state)
5367 {
5368 	ibd_resume_transmission(state);
5369 }
5370 
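/*
 * If the resource we previously ran short of (free swqes or LSO buffers)
 * has been replenished past its threshold, clear the corresponding bit in
 * id_sched_needed and ask GLDv3 to resume transmission via mac_tx_update().
 */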
5371 static void
5372 ibd_resume_transmission(ibd_state_t *state)
5373 {
5374 	int flag;
5375 	int met_thresh = 0;
5376 	int thresh = 0;
5377 	int ret = -1;
5378 
5379 	mutex_enter(&state->id_sched_lock);
5380 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5381 		mutex_enter(&state->id_tx_list.dl_mutex);
5382 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5383 		met_thresh = state->id_tx_list.dl_cnt +
5384 		    state->id_tx_rel_list.dl_cnt;
5385 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5386 		mutex_exit(&state->id_tx_list.dl_mutex);
5387 		thresh = IBD_FREE_SWQES_THRESH;
5388 		flag = IBD_RSRC_SWQE;
5389 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5390 		ASSERT(state->id_lso != NULL);
5391 		mutex_enter(&state->id_lso_lock);
5392 		met_thresh = state->id_lso->bkt_nfree;
5393 		thresh = IBD_FREE_LSOS_THRESH;
5394 		mutex_exit(&state->id_lso_lock);
5395 		flag = IBD_RSRC_LSOBUF;
5396 		if (met_thresh > thresh)
5397 			state->id_sched_lso_cnt++;
5398 	}
5399 	if (met_thresh > thresh) {
5400 		state->id_sched_needed &= ~flag;
5401 		state->id_sched_cnt++;
5402 		ret = 0;
5403 	}
5404 	mutex_exit(&state->id_sched_lock);
5405 
5406 	if (ret == 0)
5407 		mac_tx_update(state->id_mh);
5408 }
5409 
5410 /*
5411  * Release a chain of send wqes back into the free list.
5412  */
5413 static void
5414 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
5415 {
5416 	/*
5417 	 * Add back on Tx list for reuse.
5418 	 */
5419 	ASSERT(tail->swqe_next == NULL);
5420 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5421 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
5422 	tail->swqe_next = state->id_tx_rel_list.dl_head;
5423 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
5424 	state->id_tx_rel_list.dl_cnt += n;
5425 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
5426 }
5427 
5428 /*
5429  * Acquire a send wqe from the free list (called with
5430  * id_tx_list.dl_mutex held). Returns the swqe, or NULL if none is free.
5431  */
5432 static ibd_swqe_t *
5433 ibd_acquire_swqe(ibd_state_t *state)
5434 {
5435 	ibd_swqe_t *wqe;
5436 
5437 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5438 	if (state->id_tx_rel_list.dl_head != NULL) {
5439 		/* transfer id_tx_rel_list to id_tx_list */
5440 		state->id_tx_list.dl_head =
5441 		    state->id_tx_rel_list.dl_head;
5442 		state->id_tx_list.dl_cnt =
5443 		    state->id_tx_rel_list.dl_cnt;
5444 		state->id_tx_list.dl_pending_sends = B_FALSE;
5445 
5446 		/* clear id_tx_rel_list */
5447 		state->id_tx_rel_list.dl_head = NULL;
5448 		state->id_tx_rel_list.dl_cnt = 0;
5449 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5450 
5451 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5452 		state->id_tx_list.dl_cnt -= 1;
5453 		state->id_tx_list.dl_head = wqe->swqe_next;
5454 	} else {	/* no free swqe */
5455 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5456 		state->id_tx_list.dl_pending_sends = B_TRUE;
5457 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5458 		state->id_tx_short++;
5459 		wqe = NULL;
5460 	}
5461 	return (wqe);
5462 }
5463 
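/*
 * Prepare the work request for an LSO send: record the ud destination and
 * MSS, compute the combined IPoIB + IP + TCP header length, and either
 * point lso_hdr directly at the header bytes in the first mblk or, if the
 * headers span fragments, copy them into a buffer allocated here (and
 * freed later by ibd_free_lsohdr()).
 */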
5464 static int
5465 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
5466     ibt_ud_dest_hdl_t ud_dest)
5467 {
5468 	mblk_t	*nmp;
5469 	int iph_len, tcph_len;
5470 	ibt_wr_lso_t *lso;
5471 	uintptr_t ip_start, tcp_start;
5472 	uint8_t *dst;
5473 	uint_t pending, mblen;
5474 
5475 	/*
5476 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
5477 	 * we need to adjust it here for lso.
5478 	 */
5479 	lso = &(node->w_swr.wr.ud_lso);
5480 	lso->lso_ud_dest = ud_dest;
5481 	lso->lso_mss = mss;
5482 
5483 	/*
5484 	 * Calculate the LSO header size and set it in the UD LSO structure.
5485 	 * Note that the only assumption we make is that each of the IPoIB,
5486 	 * IP and TCP headers will be contained in a single mblk fragment;
5487 	 * together, the headers may span multiple mblk fragments.
5488 	 */
5489 	nmp = mp;
5490 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
5491 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
5492 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
5493 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
5494 		nmp = nmp->b_cont;
5495 
5496 	}
5497 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
5498 
5499 	tcp_start = ip_start + iph_len;
5500 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
5501 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
5502 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
5503 		nmp = nmp->b_cont;
5504 	}
5505 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5506 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5507 
5508 	/*
5509 	 * If the lso header fits entirely within a single mblk fragment,
5510 	 * we'll avoid an additional copy of the lso header here and just
5511 	 * pass the b_rptr of the mblk directly.
5512 	 *
5513 	 * If this isn't true, we'd have to allocate for it explicitly.
5514 	 */
5515 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5516 		lso->lso_hdr = mp->b_rptr;
5517 	} else {
5518 		/* On work completion, remember to free this allocated hdr */
5519 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5520 		if (lso->lso_hdr == NULL) {
5521 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5522 			    "sz = %d", lso->lso_hdr_sz);
5523 			lso->lso_hdr_sz = 0;
5524 			lso->lso_mss = 0;
5525 			return (-1);
5526 		}
5527 	}
5528 
5529 	/*
5530 	 * Copy in the lso header only if we need to
5531 	 */
5532 	if (lso->lso_hdr != mp->b_rptr) {
5533 		dst = lso->lso_hdr;
5534 		pending = lso->lso_hdr_sz;
5535 
5536 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5537 			mblen = MBLKL(nmp);
5538 			if (pending > mblen) {
5539 				bcopy(nmp->b_rptr, dst, mblen);
5540 				dst += mblen;
5541 				pending -= mblen;
5542 			} else {
5543 				bcopy(nmp->b_rptr, dst, pending);
5544 				break;
5545 			}
5546 		}
5547 	}
5548 
5549 	return (0);
5550 }
5551 
5552 static void
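/*
 * Free the LSO header buffer allocated by ibd_setup_lso(), if any; a
 * header that still points into the original mblk was never separately
 * allocated and is left alone.
 */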
5553 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5554 {
5555 	ibt_wr_lso_t *lso;
5556 
5557 	if ((!node) || (!mp))
5558 		return;
5559 
5560 	/*
5561 	 * Free any header space that we might've allocated if we
5562 	 * did an LSO
5563 	 */
5564 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5565 		lso = &(node->w_swr.wr.ud_lso);
5566 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5567 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5568 			lso->lso_hdr = NULL;
5569 			lso->lso_hdr_sz = 0;
5570 		}
5571 	}
5572 }
5573 
5574 static void
5575 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5576 {
5577 	uint_t		i;
5578 	uint_t		num_posted;
5579 	uint_t		n_wrs;
5580 	ibt_status_t	ibt_status;
5581 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
5582 	ibd_swqe_t	*tx_head, *elem;
5583 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
5584 
5585 	/* post the one request, then check for more */
5586 	ibt_status = ibt_post_send(state->id_chnl_hdl,
5587 	    &node->w_swr, 1, NULL);
5588 	if (ibt_status != IBT_SUCCESS) {
5589 		ibd_print_warn(state, "ibd_post_send: "
5590 		    "posting one wr failed: ret=%d", ibt_status);
5591 		ibd_tx_cleanup(state, node);
5592 	}
5593 
5594 	tx_head = NULL;
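	/*
	 * The caller set id_tx_busy before invoking us, so this thread is
	 * the designated poster: keep draining wqes that other senders have
	 * chained onto id_tx_head until the list is empty, then clear
	 * id_tx_busy and return.
	 */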
5595 	for (;;) {
5596 		if (tx_head == NULL) {
5597 			mutex_enter(&state->id_txpost_lock);
5598 			tx_head = state->id_tx_head;
5599 			if (tx_head == NULL) {
5600 				state->id_tx_busy = 0;
5601 				mutex_exit(&state->id_txpost_lock);
5602 				return;
5603 			}
5604 			state->id_tx_head = NULL;
5605 			mutex_exit(&state->id_txpost_lock);
5606 		}
5607 
5608 		/*
5609 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
5610 		 * at a time if possible, and keep posting them.
5611 		 */
5612 		for (n_wrs = 0, elem = tx_head;
5613 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
5614 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5615 			nodes[n_wrs] = elem;
5616 			wrs[n_wrs] = elem->w_swr;
5617 		}
5618 		tx_head = elem;
5619 
5620 		ASSERT(n_wrs != 0);
5621 
5622 		/*
5623 		 * If posting fails for some reason, we'll never receive
5624 		 * completion intimation, so we'll need to cleanup. But
5625 		 * we need to make sure we don't clean up nodes whose
5626 		 * wrs have been successfully posted. We assume that the
5627 		 * hca driver returns on the first failure to post and
5628 		 * therefore the first 'num_posted' entries don't need
5629 		 * cleanup here.
5630 		 */
5631 		num_posted = 0;
5632 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5633 		    wrs, n_wrs, &num_posted);
5634 		if (ibt_status != IBT_SUCCESS) {
5635 			ibd_print_warn(state, "ibd_post_send: "
5636 			    "posting multiple wrs failed: "
5637 			    "requested=%d, done=%d, ret=%d",
5638 			    n_wrs, num_posted, ibt_status);
5639 
5640 			for (i = num_posted; i < n_wrs; i++)
5641 				ibd_tx_cleanup(state, nodes[i]);
5642 		}
5643 	}
5644 }
5645 
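/*
 * Build the scatter/gather list for a UD send. A large packet with few
 * enough fragments is mapped directly with ibt_map_mem_iov(); otherwise
 * the data is bcopy'ed either into the swqe's pre-mapped copybuf or, for
 * packets larger than id_tx_buf_sz, into pre-mapped LSO buffers.
 */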
5646 static int
5647 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5648     uint_t lsohdr_sz)
5649 {
5650 	ibt_wr_ds_t *sgl;
5651 	ibt_status_t ibt_status;
5652 	mblk_t *nmp;
5653 	mblk_t *data_mp;
5654 	uchar_t *bufp;
5655 	size_t blksize;
5656 	size_t skip;
5657 	size_t avail;
5658 	uint_t pktsize;
5659 	uint_t frag_len;
5660 	uint_t pending_hdr;
5661 	int nmblks;
5662 	int i;
5663 
5664 	/*
5665 	 * Let's skip ahead to the data if this is LSO
5666 	 */
5667 	data_mp = mp;
5668 	pending_hdr = 0;
5669 	if (lsohdr_sz) {
5670 		pending_hdr = lsohdr_sz;
5671 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5672 			frag_len = nmp->b_wptr - nmp->b_rptr;
5673 			if (frag_len > pending_hdr)
5674 				break;
5675 			pending_hdr -= frag_len;
5676 		}
5677 		data_mp = nmp;	/* start of data past lso header */
5678 		ASSERT(data_mp != NULL);
5679 	}
5680 
5681 	/*
5682 	 * Calculate the size of message data and number of msg blocks
5683 	 */
5684 	pktsize = 0;
5685 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5686 	    nmp = nmp->b_cont, nmblks++) {
5687 		pktsize += MBLKL(nmp);
5688 	}
5689 	pktsize -= pending_hdr;
5690 
5691 	/*
5692 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5693 	 * "copy-threshold", and if the number of mp fragments is less than
5694 	 * the maximum acceptable.
5695 	 */
5696 	if ((state->id_hca_res_lkey_capab) &&
5697 	    (pktsize > IBD_TX_COPY_THRESH) &&
5698 	    (nmblks < state->id_max_sqseg_hiwm)) {
5699 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5700 		ibt_iov_attr_t iov_attr;
5701 
5702 		iov_attr.iov_as = NULL;
5703 		iov_attr.iov = iov_arr;
5704 		iov_attr.iov_buf = NULL;
5705 		iov_attr.iov_list_len = nmblks;
5706 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5707 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5708 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5709 
5710 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5711 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5712 			iov_arr[i].iov_len = MBLKL(nmp);
5713 			if (i == 0) {
5714 				iov_arr[i].iov_addr += pending_hdr;
5715 				iov_arr[i].iov_len -= pending_hdr;
5716 			}
5717 		}
5718 
5719 		node->w_buftype = IBD_WQE_MAPPED;
5720 		node->w_swr.wr_sgl = node->w_sgl;
5721 
5722 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5723 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5724 		if (ibt_status != IBT_SUCCESS) {
5725 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5726 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5727 			goto ibd_copy_path;
5728 		}
5729 
5730 		return (0);
5731 	}
5732 
5733 ibd_copy_path:
5734 	if (pktsize <= state->id_tx_buf_sz) {
5735 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5736 		node->w_swr.wr_nds = 1;
5737 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5738 		node->w_buftype = IBD_WQE_TXBUF;
5739 
5740 		/*
5741 		 * Even though this is the copy path for transfers less than
5742 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5743 		 * is possible the first data mblk fragment (data_mp) still
5744 		 * contains part of the LSO header that we need to skip.
5745 		 */
5746 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5747 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5748 			blksize = MBLKL(nmp) - pending_hdr;
5749 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5750 			bufp += blksize;
5751 			pending_hdr = 0;
5752 		}
5753 
5754 		return (0);
5755 	}
5756 
5757 	/*
5758 	 * Copy path for transfers greater than id_tx_buf_sz
5759 	 */
5760 	node->w_swr.wr_sgl = node->w_sgl;
5761 	if (ibd_acquire_lsobufs(state, pktsize,
5762 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5763 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5764 		return (-1);
5765 	}
5766 	node->w_buftype = IBD_WQE_LSOBUF;
5767 
5768 	/*
5769 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5770 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5771 	 * need to skip part of the LSO header in the first fragment
5772 	 * as before.
5773 	 */
5774 	nmp = data_mp;
5775 	skip = pending_hdr;
5776 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5777 		sgl = node->w_swr.wr_sgl + i;
5778 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5779 		avail = IBD_LSO_BUFSZ;
5780 		while (nmp && avail) {
5781 			blksize = MBLKL(nmp) - skip;
5782 			if (blksize > avail) {
5783 				bcopy(nmp->b_rptr + skip, bufp, avail);
5784 				skip += avail;
5785 				avail = 0;
5786 			} else {
5787 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5788 				skip = 0;
5789 				avail -= blksize;
5790 				bufp += blksize;
5791 				nmp = nmp->b_cont;
5792 			}
5793 		}
5794 	}
5795 
5796 	return (0);
5797 }
5798 
5799 /*
5800  * Schedule a completion queue polling to reap the resource we're
5801  * short on.  If we implement the change to reap tx completions
5802  * in a separate thread, we'll need to wake up that thread here.
5803  */
5804 static int
5805 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5806 {
5807 	ibd_req_t *req;
5808 
5809 	mutex_enter(&state->id_sched_lock);
5810 	state->id_sched_needed |= resource_type;
5811 	mutex_exit(&state->id_sched_lock);
5812 
5813 	/*
5814 	 * If we are asked to queue a work entry, we need to do it
5815 	 */
5816 	if (q_flag) {
5817 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5818 		if (req == NULL)
5819 			return (-1);
5820 
5821 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5822 	}
5823 
5824 	return (0);
5825 }
5826 
5827 /*
5828  * The passed in packet has this format:
5829  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5830  */
5831 static boolean_t
5832 ibd_send(ibd_state_t *state, mblk_t *mp)
5833 {
5834 	ibd_ace_t *ace;
5835 	ibd_swqe_t *node;
5836 	ipoib_mac_t *dest;
5837 	ib_header_info_t *ipibp;
5838 	ip6_t *ip6h;
5839 	uint_t pktsize;
5840 	uint32_t mss;
5841 	uint32_t hckflags;
5842 	uint32_t lsoflags = 0;
5843 	uint_t lsohdr_sz = 0;
5844 	int ret, len;
5845 	boolean_t dofree = B_FALSE;
5846 	boolean_t rc;
5847 	/* if (rc_chan == NULL) send by UD; else send by RC; */
5848 	ibd_rc_chan_t *rc_chan;
5849 	int nmblks;
5850 	mblk_t *nmp;
5851 
5852 	/*
5853 	 * If we aren't done with the device initialization and start,
5854 	 * we shouldn't be here.
5855 	 */
5856 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5857 		return (B_FALSE);
5858 
5859 	/*
5860 	 * Obtain an address handle for the destination.
5861 	 */
5862 	ipibp = (ib_header_info_t *)mp->b_rptr;
5863 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5864 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5865 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5866 
5867 	rc_chan = NULL;
5868 	ace = ibd_acache_lookup(state, dest, &ret, 1);
5869 	if (state->id_enable_rc && (ace != NULL) &&
5870 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
5871 		if (ace->ac_chan == NULL) {
5872 			state->rc_null_conn++;
5873 		} else {
5874 			if (ace->ac_chan->chan_state ==
5875 			    IBD_RC_STATE_ACT_ESTAB) {
5876 				rc_chan = ace->ac_chan;
5877 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
5878 				node = WQE_TO_SWQE(
5879 				    rc_chan->tx_wqe_list.dl_head);
5880 				if (node != NULL) {
5881 					rc_chan->tx_wqe_list.dl_cnt -= 1;
5882 					rc_chan->tx_wqe_list.dl_head =
5883 					    node->swqe_next;
5884 				} else {
5885 					node = ibd_rc_acquire_swqes(rc_chan);
5886 				}
5887 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
5888 
5889 				if (node == NULL) {
5890 					state->rc_swqe_short++;
5891 					mutex_enter(&state->id_sched_lock);
5892 					state->id_sched_needed |=
5893 					    IBD_RSRC_RC_SWQE;
5894 					mutex_exit(&state->id_sched_lock);
5895 					ibd_dec_ref_ace(state, ace);
5896 					return (B_FALSE);
5897 				}
5898 			} else {
5899 				state->rc_no_estab_conn++;
5900 			}
5901 		}
5902 	}
5903 
5904 	if (rc_chan == NULL) {
5905 		mutex_enter(&state->id_tx_list.dl_mutex);
5906 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
5907 		if (node != NULL) {
5908 			state->id_tx_list.dl_cnt -= 1;
5909 			state->id_tx_list.dl_head = node->swqe_next;
5910 		} else {
5911 			node = ibd_acquire_swqe(state);
5912 		}
5913 		mutex_exit(&state->id_tx_list.dl_mutex);
5914 		if (node == NULL) {
5915 			/*
5916 			 * If we don't have an swqe available, schedule a
5917 			 * transmit completion queue cleanup and hold off on
5918 			 * sending more packets until we have some free swqes
5919 			 */
5920 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
5921 				if (ace != NULL) {
5922 					ibd_dec_ref_ace(state, ace);
5923 				}
5924 				return (B_FALSE);
5925 			}
5926 
5927 			/*
5928 			 * If a poll cannot be scheduled, we have no choice but
5929 			 * to drop this packet
5930 			 */
5931 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5932 			if (ace != NULL) {
5933 				ibd_dec_ref_ace(state, ace);
5934 			}
5935 			return (B_TRUE);
5936 		}
5937 	}
5938 
5939 	/*
5940 	 * Initialize the commonly used fields in swqe to NULL to protect
5941 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5942 	 * failure.
5943 	 */
5944 	node->swqe_im_mblk = NULL;
5945 	node->w_swr.wr_nds = 0;
5946 	node->w_swr.wr_sgl = NULL;
5947 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5948 
5949 	/*
5950 	 * Calculate the size of message data and number of msg blocks
5951 	 */
5952 	pktsize = 0;
5953 	for (nmblks = 0, nmp = mp; nmp != NULL;
5954 	    nmp = nmp->b_cont, nmblks++) {
5955 		pktsize += MBLKL(nmp);
5956 	}
5957 
5958 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5959 		atomic_inc_64(&state->id_brd_xmt);
5960 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5961 		atomic_inc_64(&state->id_multi_xmt);
5962 
5963 	if (ace != NULL) {
5964 		node->w_ahandle = ace;
5965 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5966 	} else {
5967 		DPRINT(5,
5968 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5969 		    ((ret == EFAULT) ? "failed" : "queued"),
5970 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5971 		    htonl(dest->ipoib_gidpref[1]),
5972 		    htonl(dest->ipoib_gidsuff[0]),
5973 		    htonl(dest->ipoib_gidsuff[1]));
5974 		state->rc_ace_not_found++;
5975 		node->w_ahandle = NULL;
5976 
5977 		/*
5978 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5979 		 * cannot find a path for the specific dest address. We
5980 		 * should drop this kind of packet.  We should also drop
5981 		 * the packet if we cannot schedule a poll via the
5982 		 * async thread.  For the normal case, ibd will return the
5983 		 * packet to upper layer and wait for AH creating.
5984 		 *
5985 		 * Note that we always queue a work slot entry for the async
5986 		 * thread when we fail AH lookup (even in intr mode); this is
5987 		 * due to the convoluted way the code currently looks for AH.
5988 		 */
5989 		if (ret == EFAULT) {
5990 			dofree = B_TRUE;
5991 			rc = B_TRUE;
5992 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5993 			dofree = B_TRUE;
5994 			rc = B_TRUE;
5995 		} else {
5996 			dofree = B_FALSE;
5997 			rc = B_FALSE;
5998 		}
5999 		goto ibd_send_fail;
6000 	}
6001 
6002 	/*
6003 	 * For ND6 packets, padding is at the front of the source lladdr.
6004 	 * Insert the padding at the front.
6005 	 */
6006 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
6007 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
6008 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6009 			    sizeof (ib_header_info_t))) {
6010 				DPRINT(10, "ibd_send: pullupmsg failure ");
6011 				dofree = B_TRUE;
6012 				rc = B_TRUE;
6013 				goto ibd_send_fail;
6014 			}
6015 			ipibp = (ib_header_info_t *)mp->b_rptr;
6016 		}
6017 		ip6h = (ip6_t *)((uchar_t *)ipibp +
6018 		    sizeof (ib_header_info_t));
6019 		len = ntohs(ip6h->ip6_plen);
6020 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6021 			mblk_t	*pad;
6022 
6023 			pad = allocb(4, 0);
			if (pad == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
6024 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
6025 			linkb(mp, pad);
6026 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
6027 			    IPV6_HDR_LEN + len + 4) {
6028 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
6029 				    IPV6_HDR_LEN + len + 4)) {
6030 					DPRINT(10, "ibd_send: pullupmsg "
6031 					    "failure ");
6032 					dofree = B_TRUE;
6033 					rc = B_TRUE;
6034 					goto ibd_send_fail;
6035 				}
6036 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6037 				    sizeof (ib_header_info_t));
6038 			}
6039 
6040 			/* LINTED: E_CONSTANT_CONDITION */
6041 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
6042 		}
6043 	}
6044 
6045 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
6046 	mp->b_rptr += sizeof (ib_addrs_t);
6047 	pktsize -= sizeof (ib_addrs_t);
6048 
6049 	if (rc_chan) {	/* send in RC mode */
6050 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6051 		ibt_iov_attr_t iov_attr;
6052 		uint_t		i;
6053 		size_t	blksize;
6054 		uchar_t *bufp;
6055 		ibd_rc_tx_largebuf_t *lbufp;
6056 
6057 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
6058 
6059 		/*
6060 		 * The upper layer does the Tx checksum; we don't need to
6061 		 * do any checksumming here.
6062 		 */
6063 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
6064 
6065 		/*
6066 		 * We only do ibt_map_mem_iov() if the pktsize is above
6067 		 * the "copy-threshold", and if the number of mp
6068 		 * fragments is less than the maximum acceptable.
6069 		 */
6070 		if (pktsize <= ibd_rc_tx_copy_thresh) {
6071 			atomic_inc_64(&state->rc_xmt_small_pkt);
6072 			/*
6073 			 * Only process unicast packets in Reliable Connected
6074 			 * mode.
6075 			 */
6076 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6077 			node->w_swr.wr_nds = 1;
6078 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6079 			node->w_buftype = IBD_WQE_TXBUF;
6080 
6081 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6082 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6083 				blksize = MBLKL(nmp);
6084 				bcopy(nmp->b_rptr, bufp, blksize);
6085 				bufp += blksize;
6086 			}
6087 			freemsg(mp);
6088 			ASSERT(node->swqe_im_mblk == NULL);
6089 		} else {
6090 			if ((state->rc_enable_iov_map) &&
6091 			    (nmblks < state->rc_max_sqseg_hiwm)) {
6092 
6093 				/* do ibt_map_mem_iov() */
6094 				iov_attr.iov_as = NULL;
6095 				iov_attr.iov = iov_arr;
6096 				iov_attr.iov_buf = NULL;
6097 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
6098 				iov_attr.iov_lso_hdr_sz = 0;
6099 				iov_attr.iov_flags = IBT_IOV_SLEEP;
6100 
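				/*
				 * Build the iov list from the mblk chain,
				 * skipping any zero-length fragments.
				 */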
6101 				i = 0;
6102 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6103 					iov_arr[i].iov_len = MBLKL(nmp);
6104 					if (iov_arr[i].iov_len != 0) {
6105 						iov_arr[i].iov_addr = (caddr_t)
6106 						    (void *)nmp->b_rptr;
6107 						i++;
6108 					}
6109 				}
6110 				iov_attr.iov_list_len = i;
6111 				node->w_swr.wr_sgl = node->w_sgl;
6112 
6113 				ret = ibt_map_mem_iov(state->id_hca_hdl,
6114 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
6115 				    &node->w_mi_hdl);
6116 				if (ret != IBT_SUCCESS) {
6117 					atomic_inc_64(
6118 					    &state->rc_xmt_map_fail_pkt);
6119 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
6120 					    ") failed, nmblks=%d, real_nmblks"
6121 					    "=%d, ret=0x%x", nmblks, i, ret);
6122 					goto ibd_rc_large_copy;
6123 				}
6124 
6125 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
6126 				node->w_buftype = IBD_WQE_MAPPED;
6127 				node->swqe_im_mblk = mp;
6128 			} else {
6129 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
6130 ibd_rc_large_copy:
6131 				mutex_enter(&state->rc_tx_large_bufs_lock);
6132 				if (state->rc_tx_largebuf_nfree == 0) {
6133 					state->rc_xmt_buf_short++;
6134 					mutex_exit
6135 					    (&state->rc_tx_large_bufs_lock);
6136 					mutex_enter(&state->id_sched_lock);
6137 					state->id_sched_needed |=
6138 					    IBD_RSRC_RC_TX_LARGEBUF;
6139 					mutex_exit(&state->id_sched_lock);
6140 					dofree = B_FALSE;
6141 					rc = B_FALSE;
6142 					/*
6143 					 * If we don't have Tx large bufs,
6144 					 * return failure. node->w_buftype
6145 					 * should not be IBD_WQE_RC_COPYBUF,
6146 					 * otherwise it will cause problems
6147 					 * in ibd_rc_tx_cleanup()
6148 					 */
6149 					node->w_buftype = IBD_WQE_TXBUF;
6150 					goto ibd_send_fail;
6151 				}
6152 
6153 				lbufp = state->rc_tx_largebuf_free_head;
6154 				ASSERT(lbufp->lb_buf != NULL);
6155 				state->rc_tx_largebuf_free_head =
6156 				    lbufp->lb_next;
6157 				lbufp->lb_next = NULL;
6158 				/* Update nfree count */
6159 				state->rc_tx_largebuf_nfree --;
6160 				mutex_exit(&state->rc_tx_large_bufs_lock);
6161 				bufp = lbufp->lb_buf;
6162 				node->w_sgl[0].ds_va =
6163 				    (ib_vaddr_t)(uintptr_t)bufp;
6164 				node->w_sgl[0].ds_key =
6165 				    state->rc_tx_mr_desc.md_lkey;
6166 				node->w_sgl[0].ds_len = pktsize;
6167 				node->w_swr.wr_sgl = node->w_sgl;
6168 				node->w_swr.wr_nds = 1;
6169 				node->w_buftype = IBD_WQE_RC_COPYBUF;
6170 				node->w_rc_tx_largebuf = lbufp;
6171 
6172 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6173 					blksize = MBLKL(nmp);
6174 					if (blksize != 0) {
6175 						bcopy(nmp->b_rptr, bufp,
6176 						    blksize);
6177 						bufp += blksize;
6178 					}
6179 				}
6180 				freemsg(mp);
6181 				ASSERT(node->swqe_im_mblk == NULL);
6182 			}
6183 		}
6184 
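		/*
		 * Hand the wqe to the RC channel's posting logic: if another
		 * thread is already posting (tx_busy), chain this wqe onto
		 * the channel's tx list for it to pick up; otherwise become
		 * the poster and call ibd_rc_post_send() ourselves.
		 */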
6185 		node->swqe_next = NULL;
6186 		mutex_enter(&rc_chan->tx_post_lock);
6187 		if (rc_chan->tx_busy) {
6188 			if (rc_chan->tx_head) {
6189 				rc_chan->tx_tail->swqe_next =
6190 				    SWQE_TO_WQE(node);
6191 			} else {
6192 				rc_chan->tx_head = node;
6193 			}
6194 			rc_chan->tx_tail = node;
6195 			mutex_exit(&rc_chan->tx_post_lock);
6196 		} else {
6197 			rc_chan->tx_busy = 1;
6198 			mutex_exit(&rc_chan->tx_post_lock);
6199 			ibd_rc_post_send(rc_chan, node);
6200 		}
6201 
6202 		return (B_TRUE);
6203 	} /* send by RC */
6204 
6205 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
6206 		/*
6207 		 * The pktsize is too long. The packet size from GLD should
6208 		 * be <= state->id_mtu + sizeof (ib_addrs_t).
6209 		 */
6210 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
6211 			ibd_req_t *req;
6212 
6213 			mutex_enter(&ace->tx_too_big_mutex);
6214 			if (ace->tx_too_big_ongoing) {
6215 				mutex_exit(&ace->tx_too_big_mutex);
6216 				state->rc_xmt_reenter_too_long_pkt++;
6217 				dofree = B_TRUE;
6218 			} else {
6219 				ace->tx_too_big_ongoing = B_TRUE;
6220 				mutex_exit(&ace->tx_too_big_mutex);
6221 				state->rc_xmt_icmp_too_long_pkt++;
6222 
6223 				req = kmem_cache_alloc(state->id_req_kmc,
6224 				    KM_NOSLEEP);
6225 				if (req == NULL) {
6226 					ibd_print_warn(state, "ibd_send: alloc "
6227 					    "ibd_req_t fail");
6228 					/* Drop it. */
6229 					dofree = B_TRUE;
6230 				} else {
6231 					req->rq_ptr = mp;
6232 					req->rq_ptr2 = ace;
6233 					ibd_queue_work_slot(state, req,
6234 					    IBD_ASYNC_RC_TOO_BIG);
6235 					dofree = B_FALSE;
6236 				}
6237 			}
6238 		} else {
6239 			ibd_print_warn(state, "Reliable Connected mode is on. "
6240 			    "send, drop it",
6241 			    "send packet (%d > %d), drop it",
6242 			    pktsize, state->id_mtu);
6243 			state->rc_xmt_drop_too_long_pkt++;
6244 			/* Drop it. */
6245 			dofree = B_TRUE;
6246 		}
6247 		rc = B_TRUE;
6248 		goto ibd_send_fail;
6249 	}
6250 
6251 	atomic_add_64(&state->id_xmt_bytes, pktsize);
6252 	atomic_inc_64(&state->id_xmt_pkt);
6253 
6254 	/*
6255 	 * Do LSO and checksum related work here.  For LSO send, adjust the
6256 	 * ud destination, the opcode and the LSO header information to the
6257 	 * work request.
6258 	 */
6259 	lso_info_get(mp, &mss, &lsoflags);
6260 	if ((lsoflags & HW_LSO) != HW_LSO) {
6261 		node->w_swr.wr_opcode = IBT_WRC_SEND;
6262 		lsohdr_sz = 0;
6263 	} else {
6264 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
6265 			/*
6266 			 * The routine can only fail if there's no memory; we
6267 			 * can only drop the packet if this happens
6268 			 */
6269 			ibd_print_warn(state,
6270 			    "ibd_send: no memory, lso posting failed");
6271 			dofree = B_TRUE;
6272 			rc = B_TRUE;
6273 			goto ibd_send_fail;
6274 		}
6275 
6276 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
6277 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
6278 	}
6279 
6280 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
6281 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
6282 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
6283 	else
6284 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
6285 
6286 	/*
6287 	 * Prepare the sgl for posting; the routine can only fail if there's
6288 	 * no lso buf available for posting. If this is the case, we should
6289 	 * probably resched for lso bufs to become available and then try again.
6290 	 */
6291 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
6292 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
6293 			dofree = B_TRUE;
6294 			rc = B_TRUE;
6295 		} else {
6296 			dofree = B_FALSE;
6297 			rc = B_FALSE;
6298 		}
6299 		goto ibd_send_fail;
6300 	}
6301 	node->swqe_im_mblk = mp;
6302 
6303 	/*
6304 	 * Queue the wqe to hardware; since we can now simply queue a
6305 	 * post instead of doing it serially, we cannot assume anything
6306 	 * about the 'node' after ibd_post_send() returns.
6307 	 */
6308 	node->swqe_next = NULL;
6309 
6310 	mutex_enter(&state->id_txpost_lock);
6311 	if (state->id_tx_busy) {
6312 		if (state->id_tx_head) {
6313 			state->id_tx_tail->swqe_next =
6314 			    SWQE_TO_WQE(node);
6315 		} else {
6316 			state->id_tx_head = node;
6317 		}
6318 		state->id_tx_tail = node;
6319 		mutex_exit(&state->id_txpost_lock);
6320 	} else {
6321 		state->id_tx_busy = 1;
6322 		mutex_exit(&state->id_txpost_lock);
6323 		ibd_post_send(state, node);
6324 	}
6325 
6326 	return (B_TRUE);
6327 
6328 ibd_send_fail:
6329 	if (node && mp)
6330 		ibd_free_lsohdr(node, mp);
6331 
6332 	if (dofree)
6333 		freemsg(mp);
6334 
6335 	if (node != NULL) {
6336 		if (rc_chan) {
6337 			ibd_rc_tx_cleanup(node);
6338 		} else {
6339 			ibd_tx_cleanup(state, node);
6340 		}
6341 	}
6342 
6343 	return (rc);
6344 }
6345 
6346 /*
6347  * GLDv3 entry point for transmitting datagram.
6348  */
6349 static mblk_t *
6350 ibd_m_tx(void *arg, mblk_t *mp)
6351 {
6352 	ibd_state_t *state = (ibd_state_t *)arg;
6353 	mblk_t *next;
6354 
6355 	if (state->id_link_state != LINK_STATE_UP) {
6356 		freemsgchain(mp);
6357 		mp = NULL;
6358 	}
6359 
6360 	while (mp != NULL) {
6361 		next = mp->b_next;
6362 		mp->b_next = NULL;
6363 		if (ibd_send(state, mp) == B_FALSE) {
6364 			/* Send fail */
6365 			mp->b_next = next;
6366 			break;
6367 		}
6368 		mp = next;
6369 	}
6370 
6371 	return (mp);
6372 }
6373 
6374 /*
6375  * This handles Tx and Rx completions. With separate CQs, it handles
6376  * only Rx completions.
6377  */
6378 static uint_t
6379 ibd_intr(caddr_t arg)
6380 {
6381 	ibd_state_t *state = (ibd_state_t *)arg;
6382 
6383 	ibd_poll_rcq(state, state->id_rcq_hdl);
6384 
6385 	return (DDI_INTR_CLAIMED);
6386 }
6387 
6388 /*
6389  * Poll and fully drain the send cq
6390  */
6391 static void
6392 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6393 {
6394 	ibt_wc_t *wcs = state->id_txwcs;
6395 	uint_t numwcs = state->id_txwcs_size;
6396 	ibd_wqe_t *wqe;
6397 	ibd_swqe_t *head, *tail;
6398 	ibt_wc_t *wc;
6399 	uint_t num_polled;
6400 	int i;
6401 
6402 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
6403 		head = tail = NULL;
6404 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
6405 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
6406 			if (wc->wc_status != IBT_WC_SUCCESS) {
6407 				/*
6408 				 * Channel being torn down.
6409 				 */
6410 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
6411 					DPRINT(5, "ibd_drain_scq: flush error");
6412 					DPRINT(10, "ibd_drain_scq: Bad "
6413 					    "status %d", wc->wc_status);
6414 				} else {
6415 					DPRINT(10, "ibd_drain_scq: "
6416 					    "unexpected wc_status %d",
6417 					    wc->wc_status);
6418 				}
6419 				/*
6420 				 * Fallthrough to invoke the Tx handler to
6421 				 * release held resources, e.g., AH refcount.
6422 				 */
6423 			}
6424 			/*
6425 			 * Add this swqe to the list to be cleaned up.
6426 			 */
6427 			if (head)
6428 				tail->swqe_next = wqe;
6429 			else
6430 				head = WQE_TO_SWQE(wqe);
6431 			tail = WQE_TO_SWQE(wqe);
6432 		}
6433 		tail->swqe_next = NULL;
6434 		ibd_tx_cleanup_list(state, head, tail);
6435 
6436 		/*
6437 		 * Resume any blocked transmissions if possible
6438 		 */
6439 		ibd_resume_transmission(state);
6440 	}
6441 }
6442 
6443 /*
6444  * Poll and fully drain the receive cq
6445  */
6446 static void
6447 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6448 {
6449 	ibt_wc_t *wcs = state->id_rxwcs;
6450 	uint_t numwcs = state->id_rxwcs_size;
6451 	ibd_rwqe_t *rwqe;
6452 	ibt_wc_t *wc;
6453 	uint_t num_polled;
6454 	int i;
6455 	mblk_t *head, *tail, *mp;
6456 
6457 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
6458 		head = tail = NULL;
6459 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
6460 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
6461 			if (wc->wc_status != IBT_WC_SUCCESS) {
6462 				/*
6463 				 * Channel being torn down.
6464 				 */
6465 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
6466 					DPRINT(5, "ibd_drain_rcq: "
6467 					    "expected flushed rwqe");
6468 				} else {
6469 					DPRINT(5, "ibd_drain_rcq: "
6470 					    "unexpected wc_status %d",
6471 					    wc->wc_status);
6472 				}
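				/*
				 * Bump the outstanding count before the
				 * freemsg(); the freemsg callback
				 * (ibd_freemsg_cb) will decrement it and
				 * repost or free the rwqe.
				 */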
6473 				atomic_inc_32(
6474 				    &state->id_rx_list.dl_bufs_outstanding);
6475 				freemsg(rwqe->rwqe_im_mblk);
6476 				continue;
6477 			}
6478 			mp = ibd_process_rx(state, rwqe, wc);
6479 			if (mp == NULL)
6480 				continue;
6481 
6482 			/*
6483 			 * Add this mp to the list to send to the nw layer.
6484 			 */
6485 			if (head)
6486 				tail->b_next = mp;
6487 			else
6488 				head = mp;
6489 			tail = mp;
6490 		}
6491 		if (head)
6492 			mac_rx(state->id_mh, state->id_rh, head);
6493 
6494 		/*
6495 		 * Account for #rwqes polled.
6496 		 * Post more here, if less than one fourth full.
6497 		 */
6498 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
6499 		    (state->id_num_rwqe / 4))
6500 			ibd_post_recv_intr(state);
6501 	}
6502 }
6503 
6504 /*
6505  * Common code for interrupt handling as well as for polling
6506  * for all completed wqe's while detaching.
6507  */
6508 static void
6509 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6510 {
6511 	int flag, redo_flag;
6512 	int redo = 1;
6513 
6514 	flag = IBD_CQ_POLLING;
6515 	redo_flag = IBD_REDO_CQ_POLLING;
6516 
6517 	mutex_enter(&state->id_scq_poll_lock);
6518 	if (state->id_scq_poll_busy & flag) {
6519 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
6520 		state->id_scq_poll_busy |= redo_flag;
6521 		mutex_exit(&state->id_scq_poll_lock);
6522 		return;
6523 	}
6524 	state->id_scq_poll_busy |= flag;
6525 	mutex_exit(&state->id_scq_poll_lock);
6526 
6527 	/*
6528 	 * In some cases (eg detaching), this code can be invoked on
6529 	 * any cpu after disabling cq notification (thus no concurrency
6530 	 * exists). Apart from that, the following applies normally:
6531 	 * Transmit completion handling could be from any cpu if
6532 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
6533 	 * is interrupt driven.
6534 	 */
6535 
6536 	/*
6537 	 * Poll and drain the CQ
6538 	 */
6539 	ibd_drain_scq(state, cq_hdl);
6540 
6541 	/*
6542 	 * Enable CQ notifications and redrain the cq to catch any
6543 	 * completions we might have missed after the ibd_drain_scq()
6544 	 * above and before the ibt_enable_cq_notify() that follows.
6545 	 * Finally, service any new requests to poll the cq that
6546 	 * could've come in after the ibt_enable_cq_notify().
6547 	 */
6548 	do {
6549 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
6550 		    IBT_SUCCESS) {
6551 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6552 		}
6553 
6554 		ibd_drain_scq(state, cq_hdl);
6555 
6556 		mutex_enter(&state->id_scq_poll_lock);
6557 		if (state->id_scq_poll_busy & redo_flag)
6558 			state->id_scq_poll_busy &= ~redo_flag;
6559 		else {
6560 			state->id_scq_poll_busy &= ~flag;
6561 			redo = 0;
6562 		}
6563 		mutex_exit(&state->id_scq_poll_lock);
6564 
6565 	} while (redo);
6566 }
6567 
6568 /*
6569  * Common code for interrupt handling as well as for polling
6570  * for all completed wqe's while detaching.
6571  */
6572 static void
6573 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
6574 {
6575 	int flag, redo_flag;
6576 	int redo = 1;
6577 
6578 	flag = IBD_CQ_POLLING;
6579 	redo_flag = IBD_REDO_CQ_POLLING;
6580 
6581 	mutex_enter(&state->id_rcq_poll_lock);
6582 	if (state->id_rcq_poll_busy & flag) {
6583 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
6584 		state->id_rcq_poll_busy |= redo_flag;
6585 		mutex_exit(&state->id_rcq_poll_lock);
6586 		return;
6587 	}
6588 	state->id_rcq_poll_busy |= flag;
6589 	mutex_exit(&state->id_rcq_poll_lock);
6590 
6591 	/*
6592 	 * Poll and drain the CQ
6593 	 */
6594 	ibd_drain_rcq(state, rcq);
6595 
6596 	/*
6597 	 * Enable CQ notifications and redrain the cq to catch any
6598 	 * completions we might have missed after the ibd_drain_rcq()
6599 	 * above and before the ibt_enable_cq_notify() that follows.
6600 	 * Finally, service any new requests to poll the cq that
6601 	 * could've come in after the ibt_enable_cq_notify().
6602 	 */
6603 	do {
6604 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
6605 		    IBT_SUCCESS) {
6606 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6607 		}
6608 
6609 		ibd_drain_rcq(state, rcq);
6610 
6611 		mutex_enter(&state->id_rcq_poll_lock);
6612 		if (state->id_rcq_poll_busy & redo_flag)
6613 			state->id_rcq_poll_busy &= ~redo_flag;
6614 		else {
6615 			state->id_rcq_poll_busy &= ~flag;
6616 			redo = 0;
6617 		}
6618 		mutex_exit(&state->id_rcq_poll_lock);
6619 
6620 	} while (redo);
6621 }
6622 
6623 /*
6624  * Unmap the memory area associated with a given swqe.
6625  */
6626 void
6627 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
6628 {
6629 	ibt_status_t stat;
6630 
6631 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6632 
6633 	if (swqe->w_mi_hdl) {
6634 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6635 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6636 			DPRINT(10,
6637 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6638 		}
6639 		swqe->w_mi_hdl = NULL;
6640 	}
6641 	swqe->w_swr.wr_nds = 0;
6642 }
6643 
6644 void
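/*
 * Drop a reference on an address cache entry. If this was the last
 * reference and the entry is marked for recycling, move it from the
 * active list to the free list and, when an mce is attached, queue an
 * async request to leave the mcg and destroy the mce.
 */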
6645 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
6646 {
6647 	/*
6648 	 * The recycling logic can be eliminated from here
6649 	 * and put into the async thread if we create another
6650 	 * list to hold ACE's for unjoined mcg's.
6651 	 */
6652 	if (DEC_REF_DO_CYCLE(ace)) {
6653 		ibd_mce_t *mce;
6654 
6655 		/*
6656 		 * Check with the lock taken: we decremented
6657 		 * reference count without the lock, and some
6658 		 * transmitter might already have bumped the
6659 		 * reference count (possible in case of multicast
6660 		 * disable when we leave the AH on the active
6661 		 * list). If not still 0, get out, leaving the
6662 		 * recycle bit intact.
6663 		 *
6664 		 * Atomically transition the AH from active
6665 		 * to free list, and queue a work request to
6666 		 * leave the group and destroy the mce. No
6667 		 * transmitter can be looking at the AH or
6668 		 * the MCE in between, since we have the
6669 		 * ac_mutex lock. In the SendOnly reap case,
6670 		 * it is not necessary to hold the ac_mutex
6671 		 * and recheck the ref count (since the AH was
6672 		 * taken off the active list), we just do it
6673 		 * to have uniform processing with the Full
6674 		 * reap case.
6675 		 */
6676 		mutex_enter(&state->id_ac_mutex);
6677 		mce = ace->ac_mce;
6678 		if (GET_REF_CYCLE(ace) == 0) {
6679 			CLEAR_REFCYCLE(ace);
6680 			/*
6681 			 * Identify the case of fullmember reap as
6682 			 * opposed to mcg trap reap. Also, port up
6683 			 * might set ac_mce to NULL to indicate Tx
6684 			 * cleanup should do no more than put the
6685 			 * AH in the free list (see ibd_async_link).
6686 			 */
6687 			if (mce != NULL) {
6688 				ace->ac_mce = NULL;
6689 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6690 				/*
6691 				 * mc_req was initialized at mce
6692 				 * creation time.
6693 				 */
6694 				ibd_queue_work_slot(state,
6695 				    &mce->mc_req, IBD_ASYNC_REAP);
6696 			}
6697 			IBD_ACACHE_INSERT_FREE(state, ace);
6698 		}
6699 		mutex_exit(&state->id_ac_mutex);
6700 	}
6701 }
6702 
6703 /*
6704  * Common code that deals with clean ups after a successful or
6705  * erroneous transmission attempt.
6706  */
6707 static void
6708 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6709 {
6710 	ibd_ace_t *ace = swqe->w_ahandle;
6711 
6712 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6713 
6714 	/*
6715 	 * If this was a dynamic mapping in ibd_send(), we need to
6716 	 * unmap here. If this was an lso buffer we'd used for sending,
6717 	 * we need to release the lso buf to the pool, since the resource
6718 	 * is scarce. However, if this was simply a normal send using
6719 	 * the copybuf (present in each swqe), we don't need to release it.
6720 	 */
6721 	if (swqe->swqe_im_mblk != NULL) {
6722 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6723 			ibd_unmap_mem(state, swqe);
6724 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6725 			ibd_release_lsobufs(state,
6726 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6727 		}
6728 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6729 		freemsg(swqe->swqe_im_mblk);
6730 		swqe->swqe_im_mblk = NULL;
6731 	}
6732 
6733 	/*
6734 	 * Drop the reference count on the AH; it can be reused
6735 	 * now for a different destination if there are no more
6736 	 * posted sends that will use it. This can be eliminated
6737 	 * if we can always associate each Tx buffer with an AH.
6738 	 * The ace can be null if we are cleaning up from the
6739 	 * ibd_send() error path.
6740 	 */
6741 	if (ace != NULL) {
6742 		ibd_dec_ref_ace(state, ace);
6743 	}
6744 
6745 	/*
6746 	 * Release the send wqe for reuse.
6747 	 */
6748 	swqe->swqe_next = NULL;
6749 	ibd_release_swqe(state, swqe, swqe, 1);
6750 }
6751 
6752 static void
6753 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
6754 {
6755 	ibd_ace_t *ace;
6756 	ibd_swqe_t *swqe;
6757 	int n = 0;
6758 
6759 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
6760 
6761 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
6762 
6763 		/*
6764 		 * If this was a dynamic mapping in ibd_send(), we need to
6765 		 * unmap here. If this was an lso buffer we'd used for sending,
6766 		 * we need to release the lso buf to the pool, since the
6767 		 * resource is scarce. However, if this was simply a normal
6768 		 * send using the copybuf (present in each swqe), we don't need
6769 		 * to release it.
6770 		 */
6771 		if (swqe->swqe_im_mblk != NULL) {
6772 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
6773 				ibd_unmap_mem(state, swqe);
6774 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6775 				ibd_release_lsobufs(state,
6776 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6777 			}
6778 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6779 			freemsg(swqe->swqe_im_mblk);
6780 			swqe->swqe_im_mblk = NULL;
6781 		}
6782 
6783 		/*
6784 		 * Drop the reference count on the AH; it can be reused
6785 		 * now for a different destination if there are no more
6786 		 * posted sends that will use it. This can be eliminated
6787 		 * if we can always associate each Tx buffer with an AH.
6788 		 * The ace can be null if we are cleaning up from the
6789 		 * ibd_send() error path.
6790 		 */
6791 		ace = swqe->w_ahandle;
6792 		if (ace != NULL) {
6793 			ibd_dec_ref_ace(state, ace);
6794 		}
6795 		n++;
6796 	}
6797 
6798 	/*
6799 	 * Release the send wqes for reuse.
6800 	 */
6801 	ibd_release_swqe(state, head, tail, n);
6802 }
6803 
6804 /*
6805  * Processing to be done after receipt of a packet; hand off to GLD
6806  * in the format expected by GLD.  The received packet has this
6807  * format: 2b sap :: 00 :: data.
6808  */
6809 static mblk_t *
6810 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6811 {
6812 	ib_header_info_t *phdr;
6813 	mblk_t *mp;
6814 	ipoib_hdr_t *ipibp;
6815 	ipha_t *iphap;
6816 	ip6_t *ip6h;
6817 	int len;
6818 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
6819 	uint32_t bufs;
6820 
6821 	/*
6822 	 * Track the number of buffers handed up that must still be returned.
6823 	 */
6824 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
6825 
6826 	/* Never run out of rwqes, use allocb when running low */
6827 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
6828 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6829 		atomic_inc_32(&state->id_rx_allocb);
6830 		mp = allocb(pkt_len, BPRI_HI);
6831 		if (mp) {
6832 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
6833 			ibd_post_recv(state, rwqe);
6834 		} else {	/* no memory */
6835 			atomic_inc_32(&state->id_rx_allocb_failed);
6836 			ibd_post_recv(state, rwqe);
6837 			return (NULL);
6838 		}
6839 	} else {
6840 		mp = rwqe->rwqe_im_mblk;
6841 	}
6842 
6843 
6844 	/*
6845 	 * Adjust write pointer depending on how much data came in.
6846 	 */
6847 	mp->b_wptr = mp->b_rptr + pkt_len;
6848 
6849 	/*
6850 	 * Make sure this is NULL or we're in trouble.
6851 	 */
6852 	if (mp->b_next != NULL) {
6853 		ibd_print_warn(state,
6854 		    "ibd_process_rx: got duplicate mp from rcq?");
6855 		mp->b_next = NULL;
6856 	}
6857 
6858 	/*
6859 	 * The IB link will deliver one of the IB link layer
6860 	 * headers, called the Global Routing Header (GRH).
6861 	 * The ibd driver uses the information in the GRH to build
6862 	 * the Header_info structure and pass it up to GLDv3 with
6863 	 * the datagram.
6864 	 * If the GRH is not valid, indicate this to GLDv3 by
6865 	 * setting the VerTcFlow field to 0.
6866 	 */
6867 	phdr = (ib_header_info_t *)mp->b_rptr;
6868 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6869 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6870 
6871 		/* if it is loop back packet, just drop it. */
6872 		if (state->id_enable_rc) {
6873 			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
6874 			    &state->rc_macaddr_loopback,
6875 			    IPOIB_ADDRL) == 0) {
6876 				freemsg(mp);
6877 				return (NULL);
6878 			}
6879 		} else {
6880 			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6881 			    IPOIB_ADDRL) == 0) {
6882 				freemsg(mp);
6883 				return (NULL);
6884 			}
6885 		}
6886 
6887 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6888 		    sizeof (ipoib_mac_t));
6889 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6890 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6891 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6892 		} else {
6893 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6894 		}
6895 	} else {
6896 		/*
6897 		 * It cannot be an IBA multicast packet; it must have been
6898 		 * unicast to us. Just copy the interface address to dst.
6899 		 */
6900 		phdr->ib_grh.ipoib_vertcflow = 0;
6901 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6902 		    sizeof (ipoib_mac_t));
6903 	}
6904 
6905 	/*
6906 	 * For ND6 packets, padding is at the front of the source/target
6907 	 * lladdr. However, the inet6 layer is not aware of it, so remove
6908 	 * the padding from such packets.
6909 	 */
6910 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6911 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6912 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6913 		len = ntohs(ip6h->ip6_plen);
6914 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6915 			/* LINTED: E_CONSTANT_CONDITION */
6916 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6917 		}
6918 	}
6919 
6920 	/*
6921 	 * Update statistics
6922 	 */
6923 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
6924 	atomic_inc_64(&state->id_rcv_pkt);
6925 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6926 		atomic_inc_64(&state->id_brd_rcv);
6927 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6928 		atomic_inc_64(&state->id_multi_rcv);
6929 
6930 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6931 	/*
6932 	 * Set the receive checksum status in mp.
6933 	 * The hardware checksum can be considered valid only if:
6934 	 * 1. the CQE.IP_OK bit is set,
6935 	 * 2. CQE.CKSUM == 0xffff,
6936 	 * 3. no IPv6 routing header is present in the packet, and
6937 	 * 4. there are no IP options in the IP header.
6938 	 */
6939 
6940 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6941 	    (wc->wc_cksum == 0xFFFF) &&
6942 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6943 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6944 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6945 	}
6946 
6947 	return (mp);
6948 }
6949 
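/*
 * Illustrative sketch only: the copy-vs-loan decision made at the top of
 * ibd_process_rx() above, reduced to its essentials.  When too many
 * receive buffers are already loaned upstream, the packet is bcopy'd
 * into a freshly allocb'd mblk so the rwqe can be reposted at once;
 * otherwise the rwqe's own mblk is loaned out and comes back later via
 * the freemsg callback.  The helper name and parameters are hypothetical.
 */
static mblk_t *
ex_copy_or_loan(mblk_t *loaned, uint32_t *outstanding, uint32_t limit,
    size_t pkt_len)
{
	mblk_t *mp;

	if (atomic_inc_32_nv(outstanding) < limit)
		return (loaned);	/* loan the receive buffer upstream */

	/* over the limit: copy the data out and keep the receive buffer */
	atomic_dec_32(outstanding);
	if ((mp = allocb(pkt_len, BPRI_HI)) == NULL)
		return (NULL);		/* caller reposts rwqe, drops packet */

	bcopy(loaned->b_rptr, mp->b_rptr, pkt_len);
	mp->b_wptr = mp->b_rptr + pkt_len;
	return (mp);
}
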
6950 /*
6951  * Callback code invoked from STREAMS when the receive data buffer is
6952  * free for recycling.
6953  */
6954 static void
6955 ibd_freemsg_cb(char *arg)
6956 {
6957 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6958 	ibd_state_t *state = rwqe->w_state;
6959 
6960 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6961 
6962 	/*
6963 	 * If the driver is stopped, just free the rwqe.
6964 	 */
6965 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
6966 		DPRINT(6, "ibd_freemsg: wqe being freed");
6967 		rwqe->rwqe_im_mblk = NULL;
6968 		ibd_free_rwqe(state, rwqe);
6969 		return;
6970 	}
6971 
6972 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6973 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6974 	if (rwqe->rwqe_im_mblk == NULL) {
6975 		ibd_free_rwqe(state, rwqe);
6976 		DPRINT(6, "ibd_freemsg: desballoc failed");
6977 		return;
6978 	}
6979 
6980 	ibd_post_recv(state, rwqe);
6981 }
6982 
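/*
 * Illustrative sketch only: the desballoc()-based buffer loan that
 * ibd_freemsg_cb() above completes.  A driver-owned data buffer is
 * wrapped in an mblk together with a free routine; when the upper
 * layer eventually frees the mblk, STREAMS invokes that routine
 * instead of freeing the buffer, so the driver can re-wrap and repost
 * it.  ex_loan_t and ex_wrap_buf() are hypothetical names.
 */
typedef struct ex_loan {
	uchar_t		*el_buf;	/* driver-owned data buffer */
	size_t		el_size;	/* usable size of el_buf */
	frtn_t		el_frtn;	/* free routine and its argument */
} ex_loan_t;

static mblk_t *
ex_wrap_buf(ex_loan_t *lp, void (*freecb)(char *))
{
	lp->el_frtn.free_func = freecb;
	lp->el_frtn.free_arg = (char *)lp;

	/* attach the buffer to an mblk without copying it */
	return (desballoc(lp->el_buf, lp->el_size, 0, &lp->el_frtn));
}
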
6983 static uint_t
6984 ibd_tx_recycle(caddr_t arg)
6985 {
6986 	ibd_state_t *state = (ibd_state_t *)arg;
6987 
6988 	/*
6989 	 * Poll for completed entries
6990 	 */
6991 	ibd_poll_scq(state, state->id_scq_hdl);
6992 
6993 	return (DDI_INTR_CLAIMED);
6994 }
6995 
6996 #ifdef IBD_LOGGING
6997 static void
6998 ibd_log_init(void)
6999 {
7000 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
7001 	ibd_lbuf_ndx = 0;
7002 
7003 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
7004 }
7005 
7006 static void
7007 ibd_log_fini(void)
7008 {
7009 	if (ibd_lbuf)
7010 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
7011 	ibd_lbuf_ndx = 0;
7012 	ibd_lbuf = NULL;
7013 
7014 	mutex_destroy(&ibd_lbuf_lock);
7015 }
7016 
7017 static void
7018 ibd_log(const char *fmt, ...)
7019 {
7020 	va_list	ap;
7021 	uint32_t off;
7022 	uint32_t msglen;
7023 	char tmpbuf[IBD_DMAX_LINE];
7024 
7025 	if (ibd_lbuf == NULL)
7026 		return;
7027 
7028 	va_start(ap, fmt);
7029 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
7030 	va_end(ap);
7031 
7032 	if (msglen >= IBD_DMAX_LINE)
7033 		msglen = IBD_DMAX_LINE - 1;
7034 
7035 	mutex_enter(&ibd_lbuf_lock);
7036 
7037 	off = ibd_lbuf_ndx;		/* current msg should go here */
7038 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
7039 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
7040 
7041 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
7042 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
7043 
7044 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
7045 		ibd_lbuf_ndx = 0;
7046 
7047 	mutex_exit(&ibd_lbuf_lock);
7048 
7049 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
7050 }
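
/*
 * Illustrative sketch only (hypothetical helper): the reservation scheme
 * used by ibd_log() above.  The buffer index is advanced while holding
 * the lock, so concurrent writers obtain disjoint regions and the bcopy
 * of the formatted text can then proceed without the lock.  The index
 * wraps once fewer than two maximum-sized lines remain, so a reserved
 * region never runs past the end of the buffer.
 */
static uint32_t
ex_log_reserve(kmutex_t *lock, uint32_t *ndxp, uint32_t msglen)
{
	uint32_t off;

	mutex_enter(lock);
	off = *ndxp;			/* this message will go here */
	*ndxp += msglen;		/* next message starts here */
	if (*ndxp >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		*ndxp = 0;		/* wrap before the buffer overflows */
	mutex_exit(lock);

	return (off);
}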
7051 #endif
7052