xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 45680bd3312426f0b2a9e53e7b78a09c1fff0959)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip6.h>		/* for ip6_t */
54 #include <inet/tcp.h>		/* for tcph_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/multidata.h>
63 
64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
65 
66 /*
67  * Per-interface tunables
68  *
69  * ibd_tx_copy_thresh
70  *     This sets the threshold at which ibd will attempt to do a bcopy of the
71  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
72  *     is restricted by various parameters, so setting of this value must be
73  *     made after careful considerations only.  For instance, IB HCAs currently
74  *     impose a relatively small limit (when compared to ethernet NICs) on the
75  *     length of the SGL for transmit. On the other hand, the ip stack could
76  *     send down mp chains that are quite long when LSO is enabled.
77  *
78  * ibd_num_swqe
79  *     Number of "send WQE" elements that will be allocated and used by ibd.
80  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
81  *     buffer in each of these send wqes must be taken into account. This
82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
83  *     currently set to the same value of ibd_tx_copy_thresh, but may be
84  *     changed independently if needed).
85  *
86  * ibd_num_rwqe
87  *     Number of "receive WQE" elements that will be allocated and used by
88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
89  *     Each buffer in the receive wqe will be of MTU size.
90  *
91  * ibd_num_lso_bufs
92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
94  *     and too large to be used with regular MTU-sized copy buffers. It is
95  *     not recommended to tune this variable without understanding the
96  *     application environment and/or memory resources. The size of each of
97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
98  *
99  * ibd_num_ah
100  *     Number of AH cache entries to allocate
101  *
102  * ibd_hash_size
103  *     Hash table size for the active AH list
104  *
105  * ibd_separate_cqs
106  * ibd_txcomp_poll
107  *     These boolean variables (1 or 0) may be used to tune the behavior of
108  *     ibd in managing the send and receive completion queues and in deciding
109  *     whether or not transmit completions should be polled or interrupt
110  *     driven (when the completion queues are separate). If both the completion
111  *     queues are interrupt driven, it may not be possible for the handlers to
112  *     be invoked concurrently, depending on how the interrupts are tied on
113  *     the PCI intr line.  Note that some combination of these two parameters
114  *     may not be meaningful (and therefore not allowed).
115  *
116  * ibd_tx_softintr
117  * ibd_rx_softintr
118  *     The softintr mechanism allows ibd to avoid event queue overflows if
119  *     the receive/completion handlers are to be expensive. These are enabled
120  *     by default.
121  *
122  * ibd_log_sz
123  *     This specifies the size of the ibd log buffer in bytes. The buffer is
124  *     allocated and logging is enabled only when IBD_LOGGING is defined.
125  *
126  */
127 uint_t ibd_tx_copy_thresh = 0x1000;
128 uint_t ibd_num_swqe = 4000;
129 uint_t ibd_num_rwqe = 4000;
130 uint_t ibd_num_lso_bufs = 0x400;
131 uint_t ibd_num_ah = 64;
132 uint_t ibd_hash_size = 32;
133 uint_t ibd_separate_cqs = 1;
134 uint_t ibd_txcomp_poll = 0;
135 uint_t ibd_rx_softintr = 1;
136 uint_t ibd_tx_softintr = 1;
137 uint_t ibd_create_broadcast_group = 1;
138 #ifdef IBD_LOGGING
139 uint_t ibd_log_sz = 0x20000;
140 #endif
141 
142 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
143 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
144 #define	IBD_NUM_SWQE			ibd_num_swqe
145 #define	IBD_NUM_RWQE			ibd_num_rwqe
146 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
147 #define	IBD_NUM_AH			ibd_num_ah
148 #define	IBD_HASH_SIZE			ibd_hash_size
149 #ifdef IBD_LOGGING
150 #define	IBD_LOG_SZ			ibd_log_sz
151 #endif
152 
153 /*
154  * Receive CQ moderation parameters: NOT tunables
155  */
156 static uint_t ibd_rxcomp_count = 4;
157 static uint_t ibd_rxcomp_usec = 10;
158 
159 /*
160  * Send CQ moderation parameters: NOT tunables
161  */
162 #define	IBD_TXCOMP_COUNT		10
163 #define	IBD_TXCOMP_USEC			300
164 
165 /*
166  * Thresholds
167  *
168  * When waiting for resources (swqes or lso buffers) to become available,
169  * the first two thresholds below determine how long to wait before informing
170  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
171  * determines how low the available swqes should go before we start polling
172  * the completion queue.
173  */
174 #define	IBD_FREE_LSOS_THRESH		8
175 #define	IBD_FREE_SWQES_THRESH		20
176 #define	IBD_TX_POLL_THRESH		80
177 
178 /*
179  * When doing multiple-send-wr or multiple-recv-wr posts, this value
180  * determines how many to do at a time (in a single ibt_post_send/recv).
181  */
182 #define	IBD_MAX_POST_MULTIPLE		4
183 
184 /*
185  * Maximum length for returning chained mps back to crossbow
186  */
187 #define	IBD_MAX_RX_MP_LEN		16
188 
189 /*
190  * LSO parameters
191  */
192 #define	IBD_LSO_MAXLEN			65536
193 #define	IBD_LSO_BUFSZ			8192
194 #define	IBD_PROP_LSO_POLICY		"lso-policy"
195 
196 /*
197  * Completion queue polling control
198  */
199 #define	IBD_RX_CQ_POLLING		0x1
200 #define	IBD_TX_CQ_POLLING		0x2
201 #define	IBD_REDO_RX_CQ_POLLING		0x4
202 #define	IBD_REDO_TX_CQ_POLLING		0x8
203 
204 /*
205  * Flag bits for resources to reap
206  */
207 #define	IBD_RSRC_SWQE			0x1
208 #define	IBD_RSRC_LSOBUF			0x2
209 
210 /*
211  * Async operation types
212  */
213 #define	IBD_ASYNC_GETAH			1
214 #define	IBD_ASYNC_JOIN			2
215 #define	IBD_ASYNC_LEAVE			3
216 #define	IBD_ASYNC_PROMON		4
217 #define	IBD_ASYNC_PROMOFF		5
218 #define	IBD_ASYNC_REAP			6
219 #define	IBD_ASYNC_TRAP			7
220 #define	IBD_ASYNC_SCHED			8
221 #define	IBD_ASYNC_LINK			9
222 #define	IBD_ASYNC_EXIT			10
223 
224 /*
225  * Async operation states
226  */
227 #define	IBD_OP_NOTSTARTED		0
228 #define	IBD_OP_ONGOING			1
229 #define	IBD_OP_COMPLETED		2
230 #define	IBD_OP_ERRORED			3
231 #define	IBD_OP_ROUTERED			4
232 
233 /*
234  * State of IBD driver initialization during attach/m_start
235  */
236 #define	IBD_DRV_STATE_INITIALIZED	0x00001
237 #define	IBD_DRV_RXINTR_ADDED		0x00002
238 #define	IBD_DRV_TXINTR_ADDED		0x00004
239 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
240 #define	IBD_DRV_HCA_OPENED		0x00010
241 #define	IBD_DRV_PD_ALLOCD		0x00020
242 #define	IBD_DRV_MAC_REGISTERED		0x00040
243 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
244 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
245 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
246 #define	IBD_DRV_CQS_ALLOCD		0x00400
247 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
248 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
249 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
250 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
251 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
252 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
253 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
254 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
255 #define	IBD_DRV_STARTED			0x80000
256 
/*
 * Start/stop in-progress flags. The restart value must always be the
 * OR of the start and stop flag values, so it is derived from them
 * here instead of being written out as a separate literal.
 */
#define	IBD_DRV_START_IN_PROGRESS	0x10000000
#define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
#define	IBD_DRV_RESTART_IN_PROGRESS	\
	(IBD_DRV_START_IN_PROGRESS | IBD_DRV_STOP_IN_PROGRESS)
264 
265 /*
266  * Miscellaneous constants
267  */
268 #define	IBD_SEND			0
269 #define	IBD_RECV			1
270 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
271 #define	IBD_DEF_MAX_SDU			2044
272 #define	IBD_DEFAULT_QKEY		0xB1B
273 #ifdef IBD_LOGGING
274 #define	IBD_DMAX_LINE			100
275 #endif
276 
/*
 * Link state enumeration. The numeric values are spelled out; they are
 * the same ones the compiler assigns by default (0, 1, 2).
 */
typedef enum {
	IBD_LINK_DOWN = 0,
	IBD_LINK_UP = 1,
	IBD_LINK_UP_ABSENT = 2
} ibd_link_op_t;
285 
286 /*
287  * Driver State Pointer
288  */
289 void *ibd_list;
290 
291 /*
292  * Logging
293  */
294 #ifdef IBD_LOGGING
295 kmutex_t ibd_lbuf_lock;
296 uint8_t *ibd_lbuf;
297 uint32_t ibd_lbuf_ndx;
298 #endif
299 
300 /*
301  * Required system entry points
302  */
303 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
304 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
305 
306 /*
307  * Required driver entry points for GLDv3
308  */
309 static int ibd_m_stat(void *, uint_t, uint64_t *);
310 static int ibd_m_start(void *);
311 static void ibd_m_stop(void *);
312 static int ibd_m_promisc(void *, boolean_t);
313 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
314 static int ibd_m_unicst(void *, const uint8_t *);
315 static mblk_t *ibd_m_tx(void *, mblk_t *);
316 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
317 
318 /*
319  * Private driver entry points for GLDv3
320  */
321 
322 /*
323  * Initialization
324  */
325 static int ibd_state_init(ibd_state_t *, dev_info_t *);
326 static int ibd_init_txlist(ibd_state_t *);
327 static int ibd_init_rxlist(ibd_state_t *);
328 static int ibd_acache_init(ibd_state_t *);
329 #ifdef IBD_LOGGING
330 static void ibd_log_init(void);
331 #endif
332 
333 /*
334  * Termination/cleanup
335  */
336 static void ibd_state_fini(ibd_state_t *);
337 static void ibd_fini_txlist(ibd_state_t *);
338 static void ibd_fini_rxlist(ibd_state_t *);
339 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
340 static void ibd_acache_fini(ibd_state_t *);
341 #ifdef IBD_LOGGING
342 static void ibd_log_fini(void);
343 #endif
344 
345 /*
346  * Allocation/acquire/map routines
347  */
348 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
349 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
350 static int ibd_alloc_tx_copybufs(ibd_state_t *);
351 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
352 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
353 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
354     uint32_t *);
355 
356 /*
357  * Free/release/unmap routines
358  */
359 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
360 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
361 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
362 static void ibd_free_tx_copybufs(ibd_state_t *);
363 static void ibd_free_tx_lsobufs(ibd_state_t *);
364 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
365 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
366 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
367 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
368 
369 /*
370  * Handlers/callback routines
371  */
372 static uint_t ibd_intr(char *);
373 static uint_t ibd_tx_recycle(char *);
374 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
375 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
376 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
377 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
378 static void ibd_freemsg_cb(char *);
379 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
380     ibt_async_event_t *);
381 static void ibd_snet_notices_handler(void *, ib_gid_t,
382     ibt_subnet_event_code_t, ibt_subnet_event_t *);
383 
384 /*
385  * Send/receive routines
386  */
387 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
388 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
389 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
390 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
391 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
392 
393 /*
394  * Threads
395  */
396 static void ibd_async_work(ibd_state_t *);
397 
398 /*
399  * Async tasks
400  */
401 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
402 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
403 static void ibd_async_setprom(ibd_state_t *);
404 static void ibd_async_unsetprom(ibd_state_t *);
405 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
406 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
407 static void ibd_async_txsched(ibd_state_t *);
408 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
409 
410 /*
411  * Async task helpers
412  */
413 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
414 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
415 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
416 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
417     ipoib_mac_t *, ipoib_mac_t *);
418 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
419 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
420 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
421 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
422 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
423 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
424 static uint64_t ibd_get_portspeed(ibd_state_t *);
425 static boolean_t ibd_async_safe(ibd_state_t *);
426 static void ibd_async_done(ibd_state_t *);
427 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
428 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
429 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
430 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
431 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
432 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
433 
434 /*
435  * Helpers for attach/start routines
436  */
437 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
438 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
439 static int ibd_unattach(ibd_state_t *, dev_info_t *);
440 static int ibd_get_port_details(ibd_state_t *);
441 static int ibd_alloc_cqs(ibd_state_t *);
442 static int ibd_setup_ud_channel(ibd_state_t *);
443 static int ibd_start(ibd_state_t *);
444 static int ibd_undo_start(ibd_state_t *, link_state_t);
445 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
446 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
447 
448 
449 /*
450  * Miscellaneous helpers
451  */
452 static int ibd_sched_poll(ibd_state_t *, int, int);
453 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
454 static int ibd_resume_transmission(ibd_state_t *);
455 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
456 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
457 static void *list_get_head(list_t *);
458 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
459 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
460 static void ibd_print_warn(ibd_state_t *, char *, ...);
461 #ifdef IBD_LOGGING
462 static void ibd_log(const char *, ...);
463 #endif
464 
465 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
466     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
467 
468 /* Module Driver Info */
469 static struct modldrv ibd_modldrv = {
470 	&mod_driverops,			/* This one is a driver */
471 	"InfiniBand GLDv3 Driver",	/* short description */
472 	&ibd_dev_ops			/* driver specific ops */
473 };
474 
475 /* Module Linkage */
476 static struct modlinkage ibd_modlinkage = {
477 	MODREV_1, (void *)&ibd_modldrv, NULL
478 };
479 
480 /*
481  * Module (static) info passed to IBTL during ibt_attach
482  */
483 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
484 	IBTI_V_CURR,
485 	IBT_NETWORK,
486 	ibd_async_handler,
487 	NULL,
488 	"IPIB"
489 };
490 
491 /*
492  * GLDv3 entry points
493  */
494 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
495 static mac_callbacks_t ibd_m_callbacks = {
496 	IBD_M_CALLBACK_FLAGS,
497 	ibd_m_stat,
498 	ibd_m_start,
499 	ibd_m_stop,
500 	ibd_m_promisc,
501 	ibd_m_multicst,
502 	ibd_m_unicst,
503 	ibd_m_tx,
504 	NULL,
505 	ibd_m_getcapab
506 };
507 
/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 *
 * IBD_FILL_SCOPE_PKEY ORs (scope << 16) into the second 32-bit word of
 * the address (byte offset 4) and (pkey << 16) into the third word
 * (byte offset 8), both converted to network byte order. Since the
 * values are ORed in, any bits already set in those positions are
 * preserved. maddr is evaluated twice and is accessed as uint32_t.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)				\
{									\
	*(uint32_t *)((char *)(maddr) + sizeof (uint32_t)) |=		\
	    htonl(((uint32_t)(scope)) << 16);				\
	*(uint32_t *)((char *)(maddr) + 2 * sizeof (uint32_t)) |=	\
	    htonl(((uint32_t)(pkey)) << 16);				\
}
518 
/*
 * Clear the 4 scope bits (host-order bits 16-19 of the word at byte
 * offset 4) and the 16 p_key bits (host-order bits 16-31 of the word at
 * byte offset 8) of a multicast/broadcast address. The masks are
 * converted to network byte order before being applied. maddr is
 * evaluated twice and is accessed as uint32_t.
 */
#define	IBD_CLEAR_SCOPE_PKEY(maddr)					\
{									\
	*(uint32_t *)((char *)(maddr) + sizeof (uint32_t)) &=		\
	    htonl(~(((uint32_t)0xF) << 16));				\
	*(uint32_t *)((char *)(maddr) + 2 * sizeof (uint32_t)) &=	\
	    htonl(~(((uint32_t)0xFFFF) << 16));				\
}
526 
/*
 * Rudimentary debugging support
 *
 * On DEBUG builds DPRINT maps to debug_print(), which forwards the
 * message to vcmn_err(CE_CONT, ...) when the caller's level is at
 * least ibd_debuglevel and drops it otherwise. On non-DEBUG builds
 * DPRINT expands to nothing, so a DPRINT(...) statement is left as a
 * parenthesized expression statement.
 */
#ifdef DEBUG
int ibd_debuglevel = 100;
static void
debug_print(int l, char *fmt, ...)
{
	va_list args;

	if (l >= ibd_debuglevel) {
		va_start(args, fmt);
		vcmn_err(CE_CONT, fmt, args);
		va_end(args);
	}
}
#define	DPRINT		debug_print
#else
#define	DPRINT
#endif
547 
548 /*
549  * Common routine to print warning messages; adds in hca guid, port number
550  * and pkey to be able to identify the IBA interface.
551  */
552 static void
553 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
554 {
555 	ib_guid_t hca_guid;
556 	char ibd_print_buf[256];
557 	int len;
558 	va_list ap;
559 
560 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
561 	    0, "hca-guid", 0);
562 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
563 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
564 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
565 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
566 	va_start(ap, fmt);
567 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
568 	    fmt, ap);
569 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
570 	va_end(ap);
571 }
572 
573 /*
574  * Warlock directives
575  */
576 
577 /*
578  * id_lso_lock
579  *
580  * state->id_lso->bkt_nfree may be accessed without a lock to
581  * determine the threshold at which we have to ask the nw layer
582  * to resume transmission (see ibd_resume_transmission()).
583  */
584 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
585     ibd_state_t::id_lso))
586 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
587 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
588 
589 /*
590  * id_cq_poll_lock
591  */
592 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
593     ibd_state_t::id_cq_poll_busy))
594 
595 /*
596  * id_txpost_lock
597  */
598 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
599     ibd_state_t::id_tx_head))
600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
601     ibd_state_t::id_tx_busy))
602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
603     ibd_state_t::id_tx_tailp))
604 
605 /*
606  * id_rxpost_lock
607  */
608 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
609     ibd_state_t::id_rx_head))
610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
611     ibd_state_t::id_rx_busy))
612 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
613     ibd_state_t::id_rx_tailp))
614 
615 /*
616  * id_acache_req_lock
617  */
618 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
619     ibd_state_t::id_acache_req_cv))
620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
621     ibd_state_t::id_req_list))
622 
623 /*
624  * id_ac_mutex
625  *
626  * This mutex is actually supposed to protect id_ah_op as well,
627  * but this path of the code isn't clean (see update of id_ah_op
628  * in ibd_async_acache(), immediately after the call to
629  * ibd_async_mcache()). For now, we'll skip this check by
630  * declaring that id_ah_op is protected by some internal scheme
631  * that warlock isn't aware of.
632  */
633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
634     ibd_state_t::id_ah_active))
635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
636     ibd_state_t::id_ah_free))
637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
638     ibd_state_t::id_ah_addr))
639 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
640     ibd_state_t::id_ah_op))
641 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
642     ibd_state_t::id_ah_error))
643 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
644 
645 /*
646  * id_mc_mutex
647  */
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
649     ibd_state_t::id_mc_full))
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
651     ibd_state_t::id_mc_non))
652 
653 /*
654  * id_trap_lock
655  */
656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
657     ibd_state_t::id_trap_cv))
658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
659     ibd_state_t::id_trap_stop))
660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
661     ibd_state_t::id_trap_inprog))
662 
663 /*
664  * id_prom_op
665  */
666 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
667     ibd_state_t::id_prom_op))
668 
669 /*
670  * id_sched_lock
671  */
672 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
673     ibd_state_t::id_sched_needed))
674 
675 /*
676  * id_link_mutex
677  */
678 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
679     ibd_state_t::id_link_state))
680 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
681 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
682     ibd_state_t::id_link_speed))
683 
684 /*
685  * id_tx_list.dl_mutex
686  */
687 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
688     ibd_state_t::id_tx_list.dl_head))
689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
690     ibd_state_t::id_tx_list.dl_tail))
691 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
692     ibd_state_t::id_tx_list.dl_pending_sends))
693 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
694     ibd_state_t::id_tx_list.dl_cnt))
695 
696 /*
697  * id_rx_list.dl_mutex
698  */
699 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
700     ibd_state_t::id_rx_list.dl_head))
701 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
702     ibd_state_t::id_rx_list.dl_tail))
703 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
704     ibd_state_t::id_rx_list.dl_bufs_outstanding))
705 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
706     ibd_state_t::id_rx_list.dl_cnt))
707 
708 
709 /*
710  * Items protected by atomic updates
711  */
712 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
713     ibd_state_s::id_brd_rcv
714     ibd_state_s::id_brd_xmt
715     ibd_state_s::id_multi_rcv
716     ibd_state_s::id_multi_xmt
717     ibd_state_s::id_num_intrs
718     ibd_state_s::id_rcv_bytes
719     ibd_state_s::id_rcv_pkt
720     ibd_state_s::id_tx_short
721     ibd_state_s::id_xmt_bytes
722     ibd_state_s::id_xmt_pkt))
723 
724 /*
725  * Non-mutex protection schemes for data elements. Almost all of
726  * these are non-shared items.
727  */
728 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
729     callb_cpr
730     ib_gid_s
731     ib_header_info
732     ibd_acache_rq
733     ibd_acache_s::ac_mce
734     ibd_mcache::mc_fullreap
735     ibd_mcache::mc_jstate
736     ibd_mcache::mc_req
737     ibd_rwqe_s
738     ibd_swqe_s
739     ibd_wqe_s
740     ibt_wr_ds_s::ds_va
741     ibt_wr_lso_s
742     ipoib_mac::ipoib_qpn
743     mac_capab_lso_s
744     msgb::b_next
745     msgb::b_rptr
746     msgb::b_wptr))
747 
748 int
749 _init()
750 {
751 	int status;
752 
753 	/*
754 	 * Sanity check some parameter settings. Tx completion polling
755 	 * only makes sense with separate CQs for Tx and Rx.
756 	 */
757 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
758 		cmn_err(CE_NOTE, "!ibd: %s",
759 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
760 		ibd_txcomp_poll = 0;
761 	}
762 
763 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
764 	if (status != 0) {
765 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
766 		return (status);
767 	}
768 
769 	mac_init_ops(&ibd_dev_ops, "ibd");
770 	status = mod_install(&ibd_modlinkage);
771 	if (status != 0) {
772 		DPRINT(10, "_init:failed in mod_install()");
773 		ddi_soft_state_fini(&ibd_list);
774 		mac_fini_ops(&ibd_dev_ops);
775 		return (status);
776 	}
777 
778 #ifdef IBD_LOGGING
779 	ibd_log_init();
780 #endif
781 	return (0);
782 }
783 
784 int
785 _info(struct modinfo *modinfop)
786 {
787 	return (mod_info(&ibd_modlinkage, modinfop));
788 }
789 
790 int
791 _fini()
792 {
793 	int status;
794 
795 	status = mod_remove(&ibd_modlinkage);
796 	if (status != 0)
797 		return (status);
798 
799 	mac_fini_ops(&ibd_dev_ops);
800 	ddi_soft_state_fini(&ibd_list);
801 #ifdef IBD_LOGGING
802 	ibd_log_fini();
803 #endif
804 	return (0);
805 }
806 
807 /*
808  * Convert the GID part of the mac address from network byte order
809  * to host order.
810  */
811 static void
812 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
813 {
814 	ib_sn_prefix_t nbopref;
815 	ib_guid_t nboguid;
816 
817 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
818 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
819 	dgid->gid_prefix = b2h64(nbopref);
820 	dgid->gid_guid = b2h64(nboguid);
821 }
822 
823 /*
824  * Create the IPoIB address in network byte order from host order inputs.
825  */
826 static void
827 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
828     ib_guid_t guid)
829 {
830 	ib_sn_prefix_t nbopref;
831 	ib_guid_t nboguid;
832 
833 	mac->ipoib_qpn = htonl(qpn);
834 	nbopref = h2b64(prefix);
835 	nboguid = h2b64(guid);
836 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
837 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
838 }
839 
840 /*
841  * Send to the appropriate all-routers group when the IBA multicast group
842  * does not exist, based on whether the target group is v4 or v6.
843  */
844 static boolean_t
845 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
846     ipoib_mac_t *rmac)
847 {
848 	boolean_t retval = B_TRUE;
849 	uint32_t adjscope = state->id_scope << 16;
850 	uint32_t topword;
851 
852 	/*
853 	 * Copy the first 4 bytes in without assuming any alignment of
854 	 * input mac address; this will have IPoIB signature, flags and
855 	 * scope bits.
856 	 */
857 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
858 	topword = ntohl(topword);
859 
860 	/*
861 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
862 	 */
863 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
864 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
865 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
866 		    ((uint32_t)(state->id_pkey << 16))),
867 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
868 	else
869 		/*
870 		 * Does not have proper bits in the mgid address.
871 		 */
872 		retval = B_FALSE;
873 
874 	return (retval);
875 }
876 
/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding at the end by default. The routine that does this
 * is nce_xmit() in ip_ndp.c; it copies the nd_lla_addr right after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 *	[IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * The option header is 2 bytes in size and is followed by
 * [22 bytes of ipoib_machdr]. As a result the machdr is not 4 byte
 * aligned and has 2 bytes of padding at the end.
 *
 * The send routine of the IBD driver changes this packet as follows:
 *	[IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in the machdr being
 * 4 byte aligned.
 *
 * On the receiving side ibd_process_rx takes the above packet, removes
 * the two bytes of front padding and inserts them at the end, since the
 * IP layer does not understand padding at the front.
 *
 * Arguments:
 *	ip6h	pointer to the IPv6 header; the ICMPv6 header is taken to
 *		start immediately after it
 *	len	lvalue holding the ICMPv6 length; NOTE that the macro
 *		subtracts sizeof (nd_neighbor_advert_t) from it as a side
 *		effect, whether or not any padding is moved
 *	type	IBD_SEND shifts the link layer address up by two bytes and
 *		zeroes the two bytes in front of it; any other value shifts
 *		it down by two bytes and zeroes the two bytes behind it
 *
 * Nothing is moved unless the message is a neighbor solicitation or
 * advertisement and the adjusted len is nonzero. The advertisement
 * structure size is used for both message types.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t 	*nd_lla_ptr;					\
	icmp6_t 	*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int 		i;						\
									\
	icmp6 = (icmp6_t *)&(ip6h)[1];					\
	(len) -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    ((len) != 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)(ip6h)		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if ((type) == IBD_SEND) {				\
			/* copy from the top so no byte is overwritten */ \
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		/* i is 0 after send, IPOIB_ADDRL after receive */	\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
924 
925 /*
926  * Address handle entries maintained by the driver are kept in the
927  * free and active lists. Each entry starts out in the free list;
928  * it migrates to the active list when primed using ibt_get_paths()
929  * and ibt_modify_ud_dest() for transmission to a specific destination.
930  * In the active list, the entry has a reference count indicating the
931  * number of ongoing/uncompleted transmits that reference it. The
932  * entry is left in the active list even after the reference count
933  * goes to 0, since successive transmits can find it there and do
934  * not need to set up another entry (ie the path information is
935  * cached using the active list). Entries on the active list are
936  * also hashed using the destination link address as a key for faster
937  * lookups during transmits.
938  *
939  * For any destination address (unicast or multicast, whatever the
940  * join states), there will be at most one entry in the active list.
941  * Entries with a 0 reference count on the active list can be reused
942  * for a transmit to a new destination, if the free list is empty.
943  *
944  * The AH free list insertion/deletion is protected with the id_ac_mutex,
945  * since the async thread and Tx callback handlers insert/delete. The
946  * active list does not need a lock (all operations are done by the
947  * async thread) but updates to the reference count are atomically
948  * done (increments done by Tx path, decrements by the Tx callback handler).
949  */
/*
 * AH cache list primitives. All macro arguments are parenthesized in
 * the expansions.
 *
 * IBD_ACACHE_INSERT_FREE   put ce at the head of the free list.
 * IBD_ACACHE_GET_FREE      detach and return the head of the free list
 *                          (NULL when it is empty).
 * IBD_ACACHE_INSERT_ACTIVE put ce at the head of the active list and
 *                          enter it in the active hash, keyed by the
 *                          address of ce->ac_mac; the insert is asserted
 *                          to succeed.
 * IBD_ACACHE_PULLOUT_ACTIVE remove ce from the active list and from the
 *                          active hash; the hash result is ignored.
 * IBD_ACACHE_GET_ACTIVE    detach and return the head of the active
 *                          list; the hash is NOT updated.
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&(state)->id_ah_free, (ce))
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&(state)->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&(state)->id_ah_active, (ce));		\
	_ret_ = mod_hash_insert((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce));	\
	ASSERT(_ret_ == 0);					\
}
/*
 * mod_hash_remove() hands the removed value back through its third
 * argument, a mod_hash_val_t *. Give it a scratch variable: passing the
 * entry itself would make it store that value over the start of *ce.
 */
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	mod_hash_val_t _val_;					\
	list_remove(&(state)->id_ah_active, (ce));		\
	(void) mod_hash_remove((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, &_val_);		\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&(state)->id_ah_active)
968 
969 /*
970  * Membership states for different mcg's are tracked by two lists:
971  * the "non" list is used for promiscuous mode, when all mcg traffic
972  * needs to be inspected. This type of membership is never used for
973  * transmission, so there can not be an AH in the active list
974  * corresponding to a member in this list. This list does not need
975  * any protection, since all operations are performed by the async
976  * thread.
977  *
978  * "Full" and "SendOnly" membership is tracked using a single list,
979  * the "full" list. This is because this single list can then be
980  * searched during transmit to a multicast group (if an AH for the
981  * mcg is not found in the active list), since at least one type
982  * of membership must be present before initiating the transmit.
983  * This list is also emptied during driver detach, since sendonly
984  * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Insert/deletes to
986  * this list are done only by the async thread, but it is also
987  * searched in program context (see multicast disable case), thus
988  * the id_mc_mutex protects the list. The driver detach path also
989  * deconstructs the "full" list, but it ensures that the async
990  * thread will not be accessing the list (by blocking out mcg
991  * trap handling and making sure no more Tx reaping will happen).
992  *
993  * Currently, an IBA attach is done in the SendOnly case too,
994  * although this is not required.
995  */
/*
 * Multicast membership list primitives for the "full" list (id_mc_full)
 * and the "non" list (id_mc_non):
 *
 * IBD_MCACHE_INSERT_*   put mce at the head of the list.
 * IBD_MCACHE_FIND_*     look mgid up on the list via ibd_mcache_find().
 * IBD_MCACHE_PULLOUT_*  remove mce from the list.
 *
 * Arguments are parenthesized so that any pointer-valued expression can
 * be passed for state and mce.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
1008 
1009 /*
1010  * AH and MCE active list manipulation:
1011  *
1012  * Multicast disable requests and MCG delete traps are two cases
1013  * where the active AH entry for the mcg (if any unreferenced one exists)
1014  * will be moved to the free list (to force the next Tx to the mcg to
1015  * join the MCG in SendOnly mode). Port up handling will also move AHs
1016  * from active to free list.
1017  *
1018  * In the case when some transmits are still pending on an entry
1019  * for an mcg, but a multicast disable has already been issued on the
1020  * mcg, there are some options to consider to preserve the join state
1021  * to ensure the emitted packet is properly routed on the IBA fabric.
1022  * For the AH, we can
1023  * 1. take out of active list at multicast disable time.
1024  * 2. take out of active list only when last pending Tx completes.
1025  * For the MCE, we can
1026  * 3. take out of active list at multicast disable time.
1027  * 4. take out of active list only when last pending Tx completes.
1028  * 5. move from active list to stale list at multicast disable time.
1029  * We choose to use 2,4. We use option 4 so that if a multicast enable
1030  * is tried before the pending Tx completes, the enable code finds the
1031  * mce in the active list and just has to make sure it will not be reaped
1032  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1033  * a stale list (#5) that would be checked in the enable code would need
1034  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1035  * after the multicast disable would try to put an AH in the active list,
1036  * and associate the mce it finds in the active list to this new AH,
1037  * whereas the mce is already associated with the previous AH (taken off
1038  * the active list), and will be removed once the pending Tx's complete
1039  * (unless a reference count on mce's is implemented). One implication of
1040  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1041  * grab new references on the AH, further delaying the leave.
1042  *
1043  * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
1045  * immediately taken off the active lists (forcing a join and path lookup
1046  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1047  * to an mcg as it is repeatedly created and deleted and goes thru
1048  * reincarnations).
1049  *
1050  * When a port is already sendonly joined, and a multicast enable is
1051  * attempted, the same mce structure is promoted; this ensures only a
1052  * single mce on the active list tracks the most powerful join state.
1053  *
1054  * In the case of port up event handling, the MCE for sendonly membership
1055  * is freed up, and the ACE is put into the free list as soon as possible
1056  * (depending on whether posted Tx's have completed). For fullmembership
1057  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1058  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1059  * done; else the mce is deconstructed (mc_fullreap case).
1060  *
1061  * MCG creation and deletion trap handling:
1062  *
1063  * These traps are unreliable (meaning sometimes the trap might never
1064  * be delivered to the subscribed nodes) and may arrive out-of-order
1065  * since they use UD transport. An alternative to relying on these
1066  * unreliable traps is to poll for mcg presence every so often, but
1067  * instead of doing that, we try to be as conservative as possible
1068  * while handling the traps, and hope that the traps do arrive at
1069  * the subscribed nodes soon. Note that if a node is fullmember
1070  * joined to an mcg, it can not possibly receive a mcg create/delete
1071  * trap for that mcg (by fullmember definition); if it does, it is
1072  * an old trap from a previous incarnation of the mcg.
1073  *
1074  * Whenever a trap is received, the driver cleans up its sendonly
1075  * membership to the group; we choose to do a sendonly leave even
1076  * on a creation trap to handle the case of a prior deletion of the mcg
1077  * having gone unnoticed. Consider an example scenario:
1078  * T1: MCG M is deleted, and fires off deletion trap D1.
1079  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1080  * T3: Node N tries to transmit to M, joining in sendonly mode.
1081  * T4: MCG M is deleted, and fires off deletion trap D2.
1082  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1083  *     If the trap is D2, then a LEAVE is not required, since the mcg
1084  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1085  *     approach is to always LEAVE, but the SM may be confused if it
1086  *     receives a LEAVE without a prior JOIN.
1087  *
1088  * Management of the non-membership to an mcg is similar to the above,
1089  * except that if the interface is in promiscuous mode, it is required
1090  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1091  * if the re-join attempt fails (in which case a warning message needs
1092  * to be printed), it is not clear whether it failed due to the mcg not
1093  * existing, or some fabric/hca issues, due to the delayed nature of
1094  * trap delivery. Querying the SA to establish presence/absence of the
1095  * mcg is also racy at best. Thus, the driver just prints a warning
1096  * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
1098  * received after the mcg was deleted.
1099  */
1100 
/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that max reference
 * count on any handle is limited by number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 *
 * CYCLEVAL           the "cycle" (recycle) bit within ac_ref.
 * CLEAR_REFCYCLE     zero ac_ref: no references, cycle bit clear.
 * CYCLE_SET          true when the cycle bit is set.
 * GET_REF            raw ac_ref value, cycle bit included.
 * GET_REF_CYCLE      ac_ref with the cycle bit masked off; asserts that
 *                    the bit is set.
 * INC_REF            atomically add num to ac_ref.
 * SET_CYCLE_IF_REF   B_TRUE if the cycle bit is (now) set, i.e. it was
 *                    already set or ac_ref was non-zero when the bit was
 *                    added; B_FALSE if ac_ref was 0, in which case
 *                    ac_ref is reset to 0.
 * DEC_REF_DO_CYCLE   atomically drop one reference; B_TRUE when the new
 *                    value is exactly CYCLEVAL (cycle bit set and no
 *                    references left), B_FALSE otherwise.
 *
 * The ace argument is parenthesized in every expansion, and each
 * expression macro is fully parenthesized.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	((ace)->ac_ref = 0)
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, (num));		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		(CLEAR_REFCYCLE(ace), B_FALSE) :	\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_add_32_nv(&(ace)->ac_ref, -1) ==		\
	    CYCLEVAL ?					\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
1149 
1150 static void *
1151 list_get_head(list_t *list)
1152 {
1153 	list_node_t *lhead = list_head(list);
1154 
1155 	if (lhead != NULL)
1156 		list_remove(list, lhead);
1157 	return (lhead);
1158 }
1159 
1160 /*
1161  * This is always guaranteed to be able to queue the work.
1162  */
1163 static void
1164 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1165 {
1166 	/* Initialize request */
1167 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1168 	ptr->rq_op = op;
1169 
1170 	/*
1171 	 * Queue provided slot onto request pool.
1172 	 */
1173 	mutex_enter(&state->id_acache_req_lock);
1174 	list_insert_tail(&state->id_req_list, ptr);
1175 
1176 	/* Go, fetch, async thread */
1177 	cv_signal(&state->id_acache_req_cv);
1178 	mutex_exit(&state->id_acache_req_lock);
1179 }
1180 
1181 /*
1182  * Main body of the per interface async thread.
1183  */
1184 static void
1185 ibd_async_work(ibd_state_t *state)
1186 {
1187 	ibd_req_t *ptr;
1188 	callb_cpr_t cprinfo;
1189 
1190 	mutex_enter(&state->id_acache_req_lock);
1191 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1192 	    callb_generic_cpr, "ibd_async_work");
1193 
1194 	for (;;) {
1195 		ptr = list_get_head(&state->id_req_list);
1196 		if (ptr != NULL) {
1197 			mutex_exit(&state->id_acache_req_lock);
1198 
1199 			/*
1200 			 * Once we have done the operation, there is no
1201 			 * guarantee the request slot is going to be valid,
1202 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1203 			 * TRAP).
1204 			 *
1205 			 * Perform the request.
1206 			 */
1207 			switch (ptr->rq_op) {
1208 				case IBD_ASYNC_GETAH:
1209 					ibd_async_acache(state, &ptr->rq_mac);
1210 					break;
1211 				case IBD_ASYNC_JOIN:
1212 				case IBD_ASYNC_LEAVE:
1213 					ibd_async_multicast(state,
1214 					    ptr->rq_gid, ptr->rq_op);
1215 					break;
1216 				case IBD_ASYNC_PROMON:
1217 					ibd_async_setprom(state);
1218 					break;
1219 				case IBD_ASYNC_PROMOFF:
1220 					ibd_async_unsetprom(state);
1221 					break;
1222 				case IBD_ASYNC_REAP:
1223 					ibd_async_reap_group(state,
1224 					    ptr->rq_ptr, ptr->rq_gid,
1225 					    IB_MC_JSTATE_FULL);
1226 					/*
1227 					 * the req buf contains in mce
1228 					 * structure, so we do not need
1229 					 * to free it here.
1230 					 */
1231 					ptr = NULL;
1232 					break;
1233 				case IBD_ASYNC_TRAP:
1234 					ibd_async_trap(state, ptr);
1235 					break;
1236 				case IBD_ASYNC_SCHED:
1237 					ibd_async_txsched(state);
1238 					break;
1239 				case IBD_ASYNC_LINK:
1240 					ibd_async_link(state, ptr);
1241 					break;
1242 				case IBD_ASYNC_EXIT:
1243 					mutex_enter(&state->id_acache_req_lock);
1244 #ifndef __lock_lint
1245 					CALLB_CPR_EXIT(&cprinfo);
1246 #else
1247 					mutex_exit(&state->id_acache_req_lock);
1248 #endif
1249 					return;
1250 			}
1251 			if (ptr != NULL)
1252 				kmem_cache_free(state->id_req_kmc, ptr);
1253 
1254 			mutex_enter(&state->id_acache_req_lock);
1255 		} else {
1256 #ifndef __lock_lint
1257 			/*
1258 			 * Nothing to do: wait till new request arrives.
1259 			 */
1260 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1261 			cv_wait(&state->id_acache_req_cv,
1262 			    &state->id_acache_req_lock);
1263 			CALLB_CPR_SAFE_END(&cprinfo,
1264 			    &state->id_acache_req_lock);
1265 #endif
1266 		}
1267 	}
1268 
1269 	/*NOTREACHED*/
1270 	_NOTE(NOT_REACHED)
1271 }
1272 
1273 /*
1274  * Return when it is safe to queue requests to the async daemon; primarily
1275  * for subnet trap and async event handling. Disallow requests before the
1276  * daemon is created, and when interface deinitilization starts.
1277  */
1278 static boolean_t
1279 ibd_async_safe(ibd_state_t *state)
1280 {
1281 	mutex_enter(&state->id_trap_lock);
1282 	if (state->id_trap_stop) {
1283 		mutex_exit(&state->id_trap_lock);
1284 		return (B_FALSE);
1285 	}
1286 	state->id_trap_inprog++;
1287 	mutex_exit(&state->id_trap_lock);
1288 	return (B_TRUE);
1289 }
1290 
1291 /*
1292  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1293  * trap or event handling to complete to kill the async thread and deconstruct
1294  * the mcg/ace list.
1295  */
1296 static void
1297 ibd_async_done(ibd_state_t *state)
1298 {
1299 	mutex_enter(&state->id_trap_lock);
1300 	if (--state->id_trap_inprog == 0)
1301 		cv_signal(&state->id_trap_cv);
1302 	mutex_exit(&state->id_trap_lock);
1303 }
1304 
1305 /*
1306  * Hash functions:
1307  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1308  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1309  * These operate on mac addresses input into ibd_send, but there is no
1310  * guarantee on the alignment of the ipoib_mac_t structure.
1311  */
1312 /*ARGSUSED*/
1313 static uint_t
1314 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1315 {
1316 	ulong_t ptraddr = (ulong_t)key;
1317 	uint_t hval;
1318 
1319 	/*
1320 	 * If the input address is 4 byte aligned, we can just dereference
1321 	 * it. This is most common, since IP will send in a 4 byte aligned
1322 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1323 	 * 4 byte aligned too.
1324 	 */
1325 	if ((ptraddr & 3) == 0)
1326 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1327 
1328 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1329 	return (hval);
1330 }
1331 
1332 static int
1333 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1334 {
1335 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1336 		return (0);
1337 	else
1338 		return (1);
1339 }
1340 
1341 /*
1342  * Initialize all the per interface caches and lists; AH cache,
1343  * MCG list etc.
1344  */
1345 static int
1346 ibd_acache_init(ibd_state_t *state)
1347 {
1348 	ibd_ace_t *ce;
1349 	int i;
1350 
1351 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1352 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1353 
1354 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1355 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1356 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1357 	    offsetof(ibd_ace_t, ac_list));
1358 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1359 	    offsetof(ibd_ace_t, ac_list));
1360 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1361 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1362 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1363 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1364 	    offsetof(ibd_mce_t, mc_list));
1365 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1366 	    offsetof(ibd_mce_t, mc_list));
1367 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1368 	    offsetof(ibd_req_t, rq_list));
1369 
1370 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1371 	    IBD_NUM_AH, KM_SLEEP);
1372 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1373 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1374 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1375 			ibd_acache_fini(state);
1376 			return (DDI_FAILURE);
1377 		} else {
1378 			CLEAR_REFCYCLE(ce);
1379 			ce->ac_mce = NULL;
1380 			IBD_ACACHE_INSERT_FREE(state, ce);
1381 		}
1382 	}
1383 	return (DDI_SUCCESS);
1384 }
1385 
1386 static void
1387 ibd_acache_fini(ibd_state_t *state)
1388 {
1389 	ibd_ace_t *ptr;
1390 
1391 	mutex_enter(&state->id_ac_mutex);
1392 
1393 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1394 		ASSERT(GET_REF(ptr) == 0);
1395 		(void) ibt_free_ud_dest(ptr->ac_dest);
1396 	}
1397 
1398 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1399 		ASSERT(GET_REF(ptr) == 0);
1400 		(void) ibt_free_ud_dest(ptr->ac_dest);
1401 	}
1402 
1403 	list_destroy(&state->id_ah_free);
1404 	list_destroy(&state->id_ah_active);
1405 	list_destroy(&state->id_mc_full);
1406 	list_destroy(&state->id_mc_non);
1407 	list_destroy(&state->id_req_list);
1408 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1409 	mutex_exit(&state->id_ac_mutex);
1410 	mutex_destroy(&state->id_ac_mutex);
1411 	mutex_destroy(&state->id_mc_mutex);
1412 	mutex_destroy(&state->id_acache_req_lock);
1413 	cv_destroy(&state->id_acache_req_cv);
1414 }
1415 
1416 /*
1417  * Search AH active hash list for a cached path to input destination.
1418  * If we are "just looking", hold == F. When we are in the Tx path,
1419  * we set hold == T to grab a reference on the AH so that it can not
1420  * be recycled to a new destination while the Tx request is posted.
1421  */
1422 static ibd_ace_t *
1423 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1424 {
1425 	ibd_ace_t *ptr;
1426 
1427 	ASSERT(mutex_owned(&state->id_ac_mutex));
1428 
1429 	/*
1430 	 * Do hash search.
1431 	 */
1432 	if (mod_hash_find(state->id_ah_active_hash,
1433 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1434 		if (hold)
1435 			INC_REF(ptr, num);
1436 		return (ptr);
1437 	}
1438 	return (NULL);
1439 }
1440 
1441 /*
1442  * This is called by the tx side; if an initialized AH is found in
1443  * the active list, it is locked down and can be used; if no entry
1444  * is found, an async request is queued to do path resolution.
1445  */
1446 static ibd_ace_t *
1447 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1448 {
1449 	ibd_ace_t *ptr;
1450 	ibd_req_t *req;
1451 
1452 	/*
1453 	 * Only attempt to print when we can; in the mdt pattr case, the
1454 	 * address is not aligned properly.
1455 	 */
1456 	if (((ulong_t)mac & 3) == 0) {
1457 		DPRINT(4,
1458 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1459 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1460 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1461 		    htonl(mac->ipoib_gidsuff[1]));
1462 	}
1463 
1464 	mutex_enter(&state->id_ac_mutex);
1465 
1466 	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
1467 		mutex_exit(&state->id_ac_mutex);
1468 		return (ptr);
1469 	}
1470 
1471 	/*
1472 	 * Implementation of a single outstanding async request; if
1473 	 * the operation is not started yet, queue a request and move
1474 	 * to ongoing state. Remember in id_ah_addr for which address
1475 	 * we are queueing the request, in case we need to flag an error;
1476 	 * Any further requests, for the same or different address, until
1477 	 * the operation completes, is sent back to GLDv3 to be retried.
1478 	 * The async thread will update id_ah_op with an error indication
1479 	 * or will set it to indicate the next look up can start; either
1480 	 * way, it will mac_tx_update() so that all blocked requests come
1481 	 * back here.
1482 	 */
1483 	*err = EAGAIN;
1484 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1485 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1486 		if (req != NULL) {
1487 			/*
1488 			 * We did not even find the entry; queue a request
1489 			 * for it.
1490 			 */
1491 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1492 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1493 			state->id_ah_op = IBD_OP_ONGOING;
1494 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1495 		}
1496 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1497 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1498 		/*
1499 		 * Check the status of the pathrecord lookup request
1500 		 * we had queued before.
1501 		 */
1502 		if (state->id_ah_op == IBD_OP_ERRORED) {
1503 			*err = EFAULT;
1504 			state->id_ah_error++;
1505 		} else {
1506 			/*
1507 			 * IBD_OP_ROUTERED case: We need to send to the
1508 			 * all-router MCG. If we can find the AH for
1509 			 * the mcg, the Tx will be attempted. If we
1510 			 * do not find the AH, we return NORESOURCES
1511 			 * to retry.
1512 			 */
1513 			ipoib_mac_t routermac;
1514 
1515 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1516 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1517 			    numwqe);
1518 		}
1519 		state->id_ah_op = IBD_OP_NOTSTARTED;
1520 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1521 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1522 		/*
1523 		 * This case can happen when we get a higher band
1524 		 * packet. The easiest way is to reset the state machine
1525 		 * to accommodate the higher priority packet.
1526 		 */
1527 		state->id_ah_op = IBD_OP_NOTSTARTED;
1528 	}
1529 	mutex_exit(&state->id_ac_mutex);
1530 
1531 	return (ptr);
1532 }
1533 
1534 /*
1535  * Grab a not-currently-in-use AH/PathRecord from the active
1536  * list to recycle to a new destination. Only the async thread
1537  * executes this code.
1538  */
1539 static ibd_ace_t *
1540 ibd_acache_get_unref(ibd_state_t *state)
1541 {
1542 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1543 
1544 	ASSERT(mutex_owned(&state->id_ac_mutex));
1545 
1546 	/*
1547 	 * Do plain linear search.
1548 	 */
1549 	while (ptr != NULL) {
1550 		/*
1551 		 * Note that it is possible that the "cycle" bit
1552 		 * is set on the AH w/o any reference count. The
1553 		 * mcg must have been deleted, and the tx cleanup
1554 		 * just decremented the reference count to 0, but
1555 		 * hasn't gotten around to grabbing the id_ac_mutex
1556 		 * to move the AH into the free list.
1557 		 */
1558 		if (GET_REF(ptr) == 0) {
1559 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1560 			break;
1561 		}
1562 		ptr = list_next(&state->id_ah_active, ptr);
1563 	}
1564 	return (ptr);
1565 }
1566 
1567 /*
1568  * Invoked to clean up AH from active list in case of multicast
1569  * disable and to handle sendonly memberships during mcg traps.
1570  * And for port up processing for multicast and unicast AHs.
1571  * Normally, the AH is taken off the active list, and put into
1572  * the free list to be recycled for a new destination. In case
1573  * Tx requests on the AH have not completed yet, the AH is marked
1574  * for reaping (which will put the AH on the free list) once the Tx's
1575  * complete; in this case, depending on the "force" input, we take
1576  * out the AH from the active list right now, or leave it also for
1577  * the reap operation. Returns TRUE if the AH is taken off the active
1578  * list (and either put into the free list right now, or arranged for
1579  * later), FALSE otherwise.
1580  */
1581 static boolean_t
1582 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1583 {
1584 	ibd_ace_t *acactive;
1585 	boolean_t ret = B_TRUE;
1586 
1587 	ASSERT(mutex_owned(&state->id_ac_mutex));
1588 
1589 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1590 
1591 		/*
1592 		 * Note that the AH might already have the cycle bit set
1593 		 * on it; this might happen if sequences of multicast
1594 		 * enables and disables are coming so fast, that posted
1595 		 * Tx's to the mcg have not completed yet, and the cycle
1596 		 * bit is set successively by each multicast disable.
1597 		 */
1598 		if (SET_CYCLE_IF_REF(acactive)) {
1599 			if (!force) {
1600 				/*
1601 				 * The ace is kept on the active list, further
1602 				 * Tx's can still grab a reference on it; the
1603 				 * ace is reaped when all pending Tx's
1604 				 * referencing the AH complete.
1605 				 */
1606 				ret = B_FALSE;
1607 			} else {
1608 				/*
1609 				 * In the mcg trap case, we always pull the
1610 				 * AH from the active list. And also the port
1611 				 * up multi/unicast case.
1612 				 */
1613 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1614 				acactive->ac_mce = NULL;
1615 			}
1616 		} else {
1617 			/*
1618 			 * Determined the ref count is 0, thus reclaim
1619 			 * immediately after pulling out the ace from
1620 			 * the active list.
1621 			 */
1622 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1623 			acactive->ac_mce = NULL;
1624 			IBD_ACACHE_INSERT_FREE(state, acactive);
1625 		}
1626 
1627 	}
1628 	return (ret);
1629 }
1630 
1631 /*
1632  * Helper function for async path record lookup. If we are trying to
1633  * Tx to a MCG, check our membership, possibly trying to join the
1634  * group if required. If that fails, try to send the packet to the
1635  * all router group (indicated by the redirect output), pointing
1636  * the input mac address to the router mcg address.
1637  */
1638 static ibd_mce_t *
1639 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1640 {
1641 	ib_gid_t mgid;
1642 	ibd_mce_t *mce;
1643 	ipoib_mac_t routermac;
1644 
1645 	*redirect = B_FALSE;
1646 	ibd_n2h_gid(mac, &mgid);
1647 
1648 	/*
1649 	 * Check the FullMember+SendOnlyNonMember list.
1650 	 * Since we are the only one who manipulates the
1651 	 * id_mc_full list, no locks are needed.
1652 	 */
1653 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1654 	if (mce != NULL) {
1655 		DPRINT(4, "ibd_async_mcache : already joined to group");
1656 		return (mce);
1657 	}
1658 
1659 	/*
1660 	 * Not found; try to join(SendOnlyNonMember) and attach.
1661 	 */
1662 	DPRINT(4, "ibd_async_mcache : not joined to group");
1663 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1664 	    NULL) {
1665 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1666 		return (mce);
1667 	}
1668 
1669 	/*
1670 	 * MCGroup not present; try to join the all-router group. If
1671 	 * any of the following steps succeed, we will be redirecting
1672 	 * to the all router group.
1673 	 */
1674 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1675 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1676 		return (NULL);
1677 	*redirect = B_TRUE;
1678 	ibd_n2h_gid(&routermac, &mgid);
1679 	bcopy(&routermac, mac, IPOIB_ADDRL);
1680 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1681 	    mgid.gid_prefix, mgid.gid_guid);
1682 
1683 	/*
1684 	 * Are we already joined to the router group?
1685 	 */
1686 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1687 		DPRINT(4, "ibd_async_mcache : using already joined router"
1688 		    "group\n");
1689 		return (mce);
1690 	}
1691 
1692 	/*
1693 	 * Can we join(SendOnlyNonMember) the router group?
1694 	 */
1695 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1696 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1697 	    NULL) {
1698 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1699 		return (mce);
1700 	}
1701 
1702 	return (NULL);
1703 }
1704 
1705 /*
1706  * Async path record lookup code.
1707  */
1708 static void
1709 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1710 {
1711 	ibd_ace_t *ce;
1712 	ibd_mce_t *mce = NULL;
1713 	ibt_path_attr_t path_attr;
1714 	ibt_path_info_t path_info;
1715 	ib_gid_t destgid;
1716 	char ret = IBD_OP_NOTSTARTED;
1717 
1718 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1719 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1720 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1721 	    htonl(mac->ipoib_gidsuff[1]));
1722 
1723 	/*
1724 	 * Check whether we are trying to transmit to a MCG.
1725 	 * In that case, we need to make sure we are a member of
1726 	 * the MCG.
1727 	 */
1728 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1729 		boolean_t redirected;
1730 
1731 		/*
1732 		 * If we can not find or join the group or even
1733 		 * redirect, error out.
1734 		 */
1735 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1736 		    NULL) {
1737 			state->id_ah_op = IBD_OP_ERRORED;
1738 			return;
1739 		}
1740 
1741 		/*
1742 		 * If we got redirected, we need to determine whether
1743 		 * the AH for the new mcg is in the cache already, and
1744 		 * not pull it in then; otherwise proceed to get the
1745 		 * path for the new mcg. There is no guarantee that
1746 		 * if the AH is currently in the cache, it will still be
1747 		 * there when we look in ibd_acache_lookup(), but that's
1748 		 * okay, we will come back here.
1749 		 */
1750 		if (redirected) {
1751 			ret = IBD_OP_ROUTERED;
1752 			DPRINT(4, "ibd_async_acache :  redirected to "
1753 			    "%08X:%08X:%08X:%08X:%08X",
1754 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1755 			    htonl(mac->ipoib_gidpref[1]),
1756 			    htonl(mac->ipoib_gidsuff[0]),
1757 			    htonl(mac->ipoib_gidsuff[1]));
1758 
1759 			mutex_enter(&state->id_ac_mutex);
1760 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1761 				state->id_ah_op = IBD_OP_ROUTERED;
1762 				mutex_exit(&state->id_ac_mutex);
1763 				DPRINT(4, "ibd_async_acache : router AH found");
1764 				return;
1765 			}
1766 			mutex_exit(&state->id_ac_mutex);
1767 		}
1768 	}
1769 
1770 	/*
1771 	 * Get an AH from the free list.
1772 	 */
1773 	mutex_enter(&state->id_ac_mutex);
1774 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1775 		/*
1776 		 * No free ones; try to grab an unreferenced active
1777 		 * one. Maybe we need to make the active list LRU,
1778 		 * but that will create more work for Tx callbacks.
1779 		 * Is there a way of not having to pull out the
1780 		 * entry from the active list, but just indicate it
1781 		 * is being recycled? Yes, but that creates one more
1782 		 * check in the fast lookup path.
1783 		 */
1784 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1785 			/*
1786 			 * Pretty serious shortage now.
1787 			 */
1788 			state->id_ah_op = IBD_OP_NOTSTARTED;
1789 			mutex_exit(&state->id_ac_mutex);
1790 			DPRINT(10, "ibd_async_acache : failed to find AH "
1791 			    "slot\n");
1792 			return;
1793 		}
1794 		/*
1795 		 * We could check whether ac_mce points to a SendOnly
1796 		 * member and drop that membership now. Or do it lazily
1797 		 * at detach time.
1798 		 */
1799 		ce->ac_mce = NULL;
1800 	}
1801 	mutex_exit(&state->id_ac_mutex);
1802 	ASSERT(ce->ac_mce == NULL);
1803 
1804 	/*
1805 	 * Update the entry.
1806 	 */
1807 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1808 
1809 	bzero(&path_info, sizeof (path_info));
1810 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1811 	path_attr.pa_sgid = state->id_sgid;
1812 	path_attr.pa_num_dgids = 1;
1813 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1814 	path_attr.pa_dgids = &destgid;
1815 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1816 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1817 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1818 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1819 		goto error;
1820 	}
1821 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1822 	    ntohl(ce->ac_mac.ipoib_qpn),
1823 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1824 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1825 		goto error;
1826 	}
1827 
1828 	/*
1829 	 * mce is set whenever an AH is being associated with a
1830 	 * MCG; this will come in handy when we leave the MCG. The
1831 	 * lock protects Tx fastpath from scanning the active list.
1832 	 */
1833 	if (mce != NULL)
1834 		ce->ac_mce = mce;
1835 	mutex_enter(&state->id_ac_mutex);
1836 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1837 	state->id_ah_op = ret;
1838 	mutex_exit(&state->id_ac_mutex);
1839 	return;
1840 error:
1841 	/*
1842 	 * We might want to drop SendOnly membership here if we
1843 	 * joined above. The lock protects Tx callbacks inserting
1844 	 * into the free list.
1845 	 */
1846 	mutex_enter(&state->id_ac_mutex);
1847 	state->id_ah_op = IBD_OP_ERRORED;
1848 	IBD_ACACHE_INSERT_FREE(state, ce);
1849 	mutex_exit(&state->id_ac_mutex);
1850 }
1851 
1852 /*
1853  * While restoring port's presence on the subnet on a port up, it is possible
1854  * that the port goes down again.
1855  */
1856 static void
1857 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1858 {
1859 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1860 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1861 	    LINK_STATE_UP;
1862 	ibd_mce_t *mce, *pmce;
1863 	ibd_ace_t *ace, *pace;
1864 
1865 	DPRINT(10, "ibd_async_link(): %d", opcode);
1866 
1867 	/*
1868 	 * On a link up, revalidate the link speed/width. No point doing
1869 	 * this on a link down, since we will be unable to do SA operations,
1870 	 * defaulting to the lowest speed. Also notice that we update our
1871 	 * notion of speed before calling mac_link_update(), which will do
1872 	 * neccesary higher level notifications for speed changes.
1873 	 */
1874 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1875 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1876 		state->id_link_speed = ibd_get_portspeed(state);
1877 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1878 	}
1879 
1880 	/*
1881 	 * Do all the work required to establish our presence on
1882 	 * the subnet.
1883 	 */
1884 	if (opcode == IBD_LINK_UP_ABSENT) {
1885 		/*
1886 		 * If in promiscuous mode ...
1887 		 */
1888 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1889 			/*
1890 			 * Drop all nonmembership.
1891 			 */
1892 			ibd_async_unsetprom(state);
1893 
1894 			/*
1895 			 * Then, try to regain nonmembership to all mcg's.
1896 			 */
1897 			ibd_async_setprom(state);
1898 
1899 		}
1900 
1901 		/*
1902 		 * Drop all sendonly membership (which also gets rid of the
1903 		 * AHs); try to reacquire all full membership.
1904 		 */
1905 		mce = list_head(&state->id_mc_full);
1906 		while ((pmce = mce) != NULL) {
1907 			mce = list_next(&state->id_mc_full, mce);
1908 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1909 				ibd_leave_group(state,
1910 				    pmce->mc_info.mc_adds_vect.av_dgid,
1911 				    IB_MC_JSTATE_SEND_ONLY_NON);
1912 			else
1913 				ibd_reacquire_group(state, pmce);
1914 		}
1915 
1916 		/*
1917 		 * Recycle all active AHs to free list (and if there are
1918 		 * pending posts, make sure they will go into the free list
1919 		 * once the Tx's complete). Grab the lock to prevent
1920 		 * concurrent Tx's as well as Tx cleanups.
1921 		 */
1922 		mutex_enter(&state->id_ac_mutex);
1923 		ace = list_head(&state->id_ah_active);
1924 		while ((pace = ace) != NULL) {
1925 			boolean_t cycled;
1926 
1927 			ace = list_next(&state->id_ah_active, ace);
1928 			mce = pace->ac_mce;
1929 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1930 			    B_TRUE);
1931 			/*
1932 			 * If this is for an mcg, it must be for a fullmember,
1933 			 * since we got rid of send-only members above when
1934 			 * processing the mce list.
1935 			 */
1936 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1937 			    IB_MC_JSTATE_FULL)));
1938 
1939 			/*
1940 			 * Check if the fullmember mce needs to be torn down,
1941 			 * ie whether the DLPI disable has already been done.
1942 			 * If so, do some of the work of tx_cleanup, namely
1943 			 * causing leave (which will fail), detach and
1944 			 * mce-freeing. tx_cleanup will put the AH into free
1945 			 * list. The reason to duplicate some of this
1946 			 * tx_cleanup work is because we want to delete the
1947 			 * AH right now instead of waiting for tx_cleanup, to
1948 			 * force subsequent Tx's to reacquire an AH.
1949 			 */
1950 			if ((mce != NULL) && (mce->mc_fullreap))
1951 				ibd_async_reap_group(state, mce,
1952 				    mce->mc_info.mc_adds_vect.av_dgid,
1953 				    mce->mc_jstate);
1954 		}
1955 		mutex_exit(&state->id_ac_mutex);
1956 	}
1957 
1958 	/*
1959 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1960 	 * (which stops further events from being delivered) before
1961 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1962 	 * has already been done.
1963 	 */
1964 	mutex_enter(&state->id_link_mutex);
1965 	state->id_link_state = lstate;
1966 	mac_link_update(state->id_mh, lstate);
1967 	mutex_exit(&state->id_link_mutex);
1968 
1969 	ibd_async_done(state);
1970 }
1971 
1972 /*
1973  * Check the pkey table to see if we can find the pkey we're looking for.
1974  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1975  * failure.
1976  */
1977 static int
1978 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1979     uint16_t *pkix)
1980 {
1981 	uint16_t ndx;
1982 
1983 	ASSERT(pkix != NULL);
1984 
1985 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1986 		if (pkey_tbl[ndx] == pkey) {
1987 			*pkix = ndx;
1988 			return (0);
1989 		}
1990 	}
1991 	return (-1);
1992 }
1993 
1994 /*
1995  * When the link is notified up, we need to do a few things, based
1996  * on the port's current p_init_type_reply claiming a reinit has been
1997  * done or not. The reinit steps are:
1998  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1999  *    the old Pkey and GID0 are correct.
2000  * 2. Register for mcg traps (already done by ibmf).
2001  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2002  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2003  * 4. Give up all sendonly memberships.
2004  * 5. Acquire all full memberships.
2005  * 6. In promiscuous mode, acquire all non memberships.
2006  * 7. Recycle all AHs to free list.
2007  */
2008 static void
2009 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2010 {
2011 	ibt_hca_portinfo_t *port_infop = NULL;
2012 	ibt_status_t ibt_status;
2013 	uint_t psize, port_infosz;
2014 	ibd_link_op_t opcode;
2015 	ibd_req_t *req;
2016 	link_state_t new_link_state = LINK_STATE_UP;
2017 	uint8_t itreply;
2018 	uint16_t pkix;
2019 	int ret;
2020 
2021 	/*
2022 	 * Let's not race with a plumb or an unplumb; if we detect a
2023 	 * pkey relocation event later on here, we may have to restart.
2024 	 */
2025 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2026 
2027 	mutex_enter(&state->id_link_mutex);
2028 
2029 	/*
2030 	 * If the init code in ibd_m_start hasn't yet set up the
2031 	 * pkey/gid, nothing to do; that code will set the link state.
2032 	 */
2033 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2034 		mutex_exit(&state->id_link_mutex);
2035 		goto link_mod_return;
2036 	}
2037 
2038 	/*
2039 	 * If this routine was called in response to a port down event,
2040 	 * we just need to see if this should be informed.
2041 	 */
2042 	if (code == IBT_ERROR_PORT_DOWN) {
2043 		new_link_state = LINK_STATE_DOWN;
2044 		goto update_link_state;
2045 	}
2046 
2047 	/*
2048 	 * If it's not a port down event we've received, try to get the port
2049 	 * attributes first. If we fail here, the port is as good as down.
2050 	 * Otherwise, if the link went down by the time the handler gets
2051 	 * here, give up - we cannot even validate the pkey/gid since those
2052 	 * are not valid and this is as bad as a port down anyway.
2053 	 */
2054 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2055 	    &port_infop, &psize, &port_infosz);
2056 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2057 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2058 		new_link_state = LINK_STATE_DOWN;
2059 		goto update_link_state;
2060 	}
2061 
2062 	/*
2063 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2064 	 * PreserveContentReply are 0, we don't know anything about the
2065 	 * data loaded into the port attributes, so we need to verify
2066 	 * if gid0 and pkey are still valid.
2067 	 */
2068 	itreply = port_infop->p_init_type_reply;
2069 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2070 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2071 		/*
2072 		 * Check to see if the subnet part of GID0 has changed. If
2073 		 * not, check the simple case first to see if the pkey
2074 		 * index is the same as before; finally check to see if the
2075 		 * pkey has been relocated to a different index in the table.
2076 		 */
2077 		if (bcmp(port_infop->p_sgid_tbl,
2078 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2079 
2080 			new_link_state = LINK_STATE_DOWN;
2081 
2082 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2083 		    state->id_pkey) {
2084 
2085 			new_link_state = LINK_STATE_UP;
2086 
2087 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2088 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2089 
2090 			ibt_free_portinfo(port_infop, port_infosz);
2091 			mutex_exit(&state->id_link_mutex);
2092 
2093 			/*
2094 			 * Currently a restart is required if our pkey has moved
2095 			 * in the pkey table. If we get the ibt_recycle_ud() to
2096 			 * work as documented (expected), we may be able to
2097 			 * avoid a complete restart.  Note that we've already
2098 			 * marked both the start and stop 'in-progress' flags,
2099 			 * so it is ok to go ahead and do this restart.
2100 			 */
2101 			ibd_undo_start(state, LINK_STATE_DOWN);
2102 			if ((ret = ibd_start(state)) != 0) {
2103 				DPRINT(10, "ibd_restart: cannot restart, "
2104 				    "ret=%d", ret);
2105 			}
2106 
2107 			goto link_mod_return;
2108 		} else {
2109 			new_link_state = LINK_STATE_DOWN;
2110 		}
2111 	}
2112 
2113 update_link_state:
2114 	if (port_infop) {
2115 		ibt_free_portinfo(port_infop, port_infosz);
2116 	}
2117 
2118 	/*
2119 	 * If the old state is the same as the new state, nothing to do
2120 	 */
2121 	if (state->id_link_state == new_link_state) {
2122 		mutex_exit(&state->id_link_mutex);
2123 		goto link_mod_return;
2124 	}
2125 
2126 	/*
2127 	 * Ok, so there was a link state change; see if it's safe to ask
2128 	 * the async thread to do the work
2129 	 */
2130 	if (!ibd_async_safe(state)) {
2131 		state->id_link_state = new_link_state;
2132 		mutex_exit(&state->id_link_mutex);
2133 		goto link_mod_return;
2134 	}
2135 
2136 	mutex_exit(&state->id_link_mutex);
2137 
2138 	/*
2139 	 * If we're reporting a link up, check InitTypeReply to see if
2140 	 * the SM has ensured that the port's presence in mcg, traps,
2141 	 * etc. is intact.
2142 	 */
2143 	if (new_link_state == LINK_STATE_DOWN) {
2144 		opcode = IBD_LINK_DOWN;
2145 	} else {
2146 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2147 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2148 			opcode = IBD_LINK_UP;
2149 		} else {
2150 			opcode = IBD_LINK_UP_ABSENT;
2151 		}
2152 	}
2153 
2154 	/*
2155 	 * Queue up a request for ibd_async_link() to handle this link
2156 	 * state change event
2157 	 */
2158 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2159 	req->rq_ptr = (void *)opcode;
2160 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2161 
2162 link_mod_return:
2163 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2164 }
2165 
2166 /*
2167  * For the port up/down events, IBTL guarantees there will not be concurrent
2168  * invocations of the handler. IBTL might coalesce link transition events,
2169  * and not invoke the handler for _each_ up/down transition, but it will
2170  * invoke the handler with last known state
2171  */
2172 static void
2173 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2174     ibt_async_code_t code, ibt_async_event_t *event)
2175 {
2176 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2177 
2178 	switch (code) {
2179 	case IBT_ERROR_CATASTROPHIC_CHAN:
2180 		ibd_print_warn(state, "catastrophic channel error");
2181 		break;
2182 	case IBT_ERROR_CQ:
2183 		ibd_print_warn(state, "completion queue error");
2184 		break;
2185 	case IBT_PORT_CHANGE_EVENT:
2186 		/*
2187 		 * Events will be delivered to all instances that have
2188 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2189 		 * Only need to do work for our port; IBTF will deliver
2190 		 * events for other ports on the hca we have ibt_open_hca'ed
2191 		 * too. Note that id_port is initialized in ibd_attach()
2192 		 * before we do an ibt_open_hca() in ibd_attach().
2193 		 */
2194 		ASSERT(state->id_hca_hdl == hca_hdl);
2195 		if (state->id_port != event->ev_port)
2196 			break;
2197 
2198 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2199 		    IBT_PORT_CHANGE_PKEY) {
2200 			ibd_link_mod(state, code);
2201 		}
2202 		break;
2203 	case IBT_ERROR_PORT_DOWN:
2204 	case IBT_CLNT_REREG_EVENT:
2205 	case IBT_EVENT_PORT_UP:
2206 		/*
2207 		 * Events will be delivered to all instances that have
2208 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2209 		 * Only need to do work for our port; IBTF will deliver
2210 		 * events for other ports on the hca we have ibt_open_hca'ed
2211 		 * too. Note that id_port is initialized in ibd_attach()
2212 		 * before we do an ibt_open_hca() in ibd_attach().
2213 		 */
2214 		ASSERT(state->id_hca_hdl == hca_hdl);
2215 		if (state->id_port != event->ev_port)
2216 			break;
2217 
2218 		ibd_link_mod(state, code);
2219 		break;
2220 
2221 	case IBT_HCA_ATTACH_EVENT:
2222 	case IBT_HCA_DETACH_EVENT:
2223 		/*
2224 		 * When a new card is plugged to the system, attach_event is
2225 		 * invoked. Additionally, a cfgadm needs to be run to make the
2226 		 * card known to the system, and an ifconfig needs to be run to
2227 		 * plumb up any ibd interfaces on the card. In the case of card
2228 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2229 		 * unplumb the ibd interfaces on the card; when the card is
2230 		 * actually unplugged, the detach_event is invoked;
2231 		 * additionally, if any ibd instances are still active on the
2232 		 * card (eg there were no associated RCM scripts), driver's
2233 		 * detach routine is invoked.
2234 		 */
2235 		break;
2236 	default:
2237 		break;
2238 	}
2239 }
2240 
2241 static int
2242 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2243 {
2244 	mac_register_t *macp;
2245 	int ret;
2246 
2247 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2248 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2249 		return (DDI_FAILURE);
2250 	}
2251 
2252 	/*
2253 	 * Note that when we register with mac during attach, we don't
2254 	 * have the id_macaddr yet, so we'll simply be registering a
2255 	 * zero macaddr that we'll overwrite later during plumb (in
2256 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2257 	 * update the mac layer with the correct mtu during plumb.
2258 	 */
2259 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2260 	macp->m_driver = state;
2261 	macp->m_dip = dip;
2262 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2263 	macp->m_callbacks = &ibd_m_callbacks;
2264 	macp->m_min_sdu = 0;
2265 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2266 
2267 	/*
2268 	 *  Register ourselves with the GLDv3 interface
2269 	 */
2270 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2271 		mac_free(macp);
2272 		DPRINT(10,
2273 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2274 		return (DDI_FAILURE);
2275 	}
2276 
2277 	mac_free(macp);
2278 	return (DDI_SUCCESS);
2279 }
2280 
2281 static int
2282 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2283 {
2284 	ibt_hca_attr_t hca_attrs;
2285 	ibt_status_t ibt_status;
2286 
2287 	/*
2288 	 * Query the HCA and fetch its attributes
2289 	 */
2290 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2291 	ASSERT(ibt_status == IBT_SUCCESS);
2292 
2293 	/*
2294 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2295 	 *    full checksum offload.
2296 	 */
2297 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2298 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2299 	}
2300 
2301 	/*
2302 	 * 2. Set LSO policy, capability and maximum length
2303 	 */
2304 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2305 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2306 		state->id_lso_policy = B_TRUE;
2307 	} else {
2308 		state->id_lso_policy = B_FALSE;
2309 	}
2310 
2311 	if (hca_attrs.hca_max_lso_size > 0) {
2312 		state->id_lso_capable = B_TRUE;
2313 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2314 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2315 		else
2316 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2317 	} else {
2318 		state->id_lso_capable = B_FALSE;
2319 		state->id_lso_maxlen = 0;
2320 	}
2321 
2322 	/*
2323 	 * 3. Set Reserved L_Key capability
2324 	 */
2325 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2326 		state->id_hca_res_lkey_capab = 1;
2327 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2328 	}
2329 
2330 	/*
2331 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2332 	 *    size information is provided by the hca
2333 	 */
2334 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2335 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2336 	} else {
2337 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2338 	}
2339 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2340 		state->id_max_sqseg = IBD_MAX_SQSEG;
2341 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2342 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2343 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2344 	}
2345 
2346 	/*
2347 	 * 5. Set number of recv and send wqes after checking hca maximum
2348 	 *    channel size
2349 	 */
2350 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2351 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2352 	} else {
2353 		state->id_num_rwqe = IBD_NUM_RWQE;
2354 	}
2355 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2356 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2357 	} else {
2358 		state->id_num_swqe = IBD_NUM_SWQE;
2359 	}
2360 
2361 	return (DDI_SUCCESS);
2362 }
2363 
2364 static int
2365 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2366 {
2367 	int instance;
2368 	uint32_t progress = state->id_mac_state;
2369 	ibt_status_t ret;
2370 
2371 	if (progress & IBD_DRV_MAC_REGISTERED) {
2372 		(void) mac_unregister(state->id_mh);
2373 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2374 	}
2375 
2376 	if (progress & IBD_DRV_PD_ALLOCD) {
2377 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2378 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2379 			ibd_print_warn(state, "failed to free "
2380 			    "protection domain, ret=%d", ret);
2381 		}
2382 		state->id_pd_hdl = NULL;
2383 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2384 	}
2385 
2386 	if (progress & IBD_DRV_HCA_OPENED) {
2387 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2388 		    IBT_SUCCESS) {
2389 			ibd_print_warn(state, "failed to close "
2390 			    "HCA device, ret=%d", ret);
2391 		}
2392 		state->id_hca_hdl = NULL;
2393 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2394 	}
2395 
2396 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2397 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
2398 			ibd_print_warn(state,
2399 			    "ibt_detach() failed, ret=%d", ret);
2400 		}
2401 		state->id_ibt_hdl = NULL;
2402 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2403 	}
2404 
2405 	if (progress & IBD_DRV_TXINTR_ADDED) {
2406 		ddi_remove_softintr(state->id_tx);
2407 		state->id_tx = NULL;
2408 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2409 	}
2410 
2411 	if (progress & IBD_DRV_RXINTR_ADDED) {
2412 		ddi_remove_softintr(state->id_rx);
2413 		state->id_rx = NULL;
2414 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2415 	}
2416 
2417 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2418 		ibd_state_fini(state);
2419 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2420 	}
2421 
2422 	instance = ddi_get_instance(dip);
2423 	ddi_soft_state_free(ibd_list, instance);
2424 
2425 	return (DDI_SUCCESS);
2426 }
2427 
2428 /*
2429  * Attach device to the IO framework.
2430  */
2431 static int
2432 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2433 {
2434 	ibd_state_t *state = NULL;
2435 	ib_guid_t hca_guid;
2436 	int instance;
2437 	ibt_status_t ret;
2438 	int rv;
2439 
2440 	/*
2441 	 * IBD doesn't support suspend/resume
2442 	 */
2443 	if (cmd != DDI_ATTACH)
2444 		return (DDI_FAILURE);
2445 
2446 	/*
2447 	 * Allocate softstate structure
2448 	 */
2449 	instance = ddi_get_instance(dip);
2450 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2451 		return (DDI_FAILURE);
2452 	state = ddi_get_soft_state(ibd_list, instance);
2453 
2454 	/*
2455 	 * Initialize mutexes and condition variables
2456 	 */
2457 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2458 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2459 		goto attach_fail;
2460 	}
2461 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2462 
2463 	/*
2464 	 * Allocate rx,tx softintr
2465 	 */
2466 	if (ibd_rx_softintr == 1) {
2467 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2468 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2469 			DPRINT(10, "ibd_attach: failed in "
2470 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2471 			goto attach_fail;
2472 		}
2473 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2474 	}
2475 	if (ibd_tx_softintr == 1) {
2476 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2477 		    NULL, NULL, ibd_tx_recycle,
2478 		    (caddr_t)state)) != DDI_SUCCESS) {
2479 			DPRINT(10, "ibd_attach: failed in "
2480 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2481 			goto attach_fail;
2482 		}
2483 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2484 	}
2485 
2486 	/*
2487 	 * Obtain IBA P_Key, port number and HCA guid and validate
2488 	 * them (for P_Key, only full members are allowed as per
2489 	 * IPoIB specification; neither port number nor HCA guid
2490 	 * can be zero)
2491 	 */
2492 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2493 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2494 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2495 		    state->id_pkey);
2496 		goto attach_fail;
2497 	}
2498 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2499 	    "port-number", 0)) == 0) {
2500 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2501 		    state->id_port);
2502 		goto attach_fail;
2503 	}
2504 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2505 	    "hca-guid", 0)) == 0) {
2506 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2507 		    hca_guid);
2508 		goto attach_fail;
2509 	}
2510 
2511 	/*
2512 	 * Attach to IBTL
2513 	 */
2514 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2515 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2516 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2517 		goto attach_fail;
2518 	}
2519 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2520 
2521 	/*
2522 	 * Open the HCA
2523 	 */
2524 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2525 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2526 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2527 		goto attach_fail;
2528 	}
2529 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2530 
2531 	/*
2532 	 * Record capabilities
2533 	 */
2534 	(void) ibd_record_capab(state, dip);
2535 
2536 	/*
2537 	 * Allocate a protection domain on the HCA
2538 	 */
2539 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2540 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2541 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2542 		goto attach_fail;
2543 	}
2544 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2545 
2546 
2547 	/*
2548 	 * Register ibd interfaces with the Nemo framework
2549 	 */
2550 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2551 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2552 		goto attach_fail;
2553 	}
2554 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2555 
2556 	/*
2557 	 * We're done with everything we could to make the attach
2558 	 * succeed.  All the buffer allocations and IPoIB broadcast
2559 	 * group joins are deferred to when the interface instance
2560 	 * is actually plumbed to avoid wasting memory.
2561 	 */
2562 	return (DDI_SUCCESS);
2563 
2564 attach_fail:
2565 	(void) ibd_unattach(state, dip);
2566 	return (DDI_FAILURE);
2567 }
2568 
2569 /*
2570  * Detach device from the IO framework.
2571  */
2572 static int
2573 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2574 {
2575 	ibd_state_t *state;
2576 	int instance;
2577 
2578 	/*
2579 	 * IBD doesn't support suspend/resume
2580 	 */
2581 	if (cmd != DDI_DETACH)
2582 		return (DDI_FAILURE);
2583 
2584 	/*
2585 	 * Get the instance softstate
2586 	 */
2587 	instance = ddi_get_instance(dip);
2588 	state = ddi_get_soft_state(ibd_list, instance);
2589 
2590 	/*
2591 	 * Release all resources we're holding still.  Note that if we'd
2592 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2593 	 * so far, we should find all the flags we need in id_mac_state.
2594 	 */
2595 	(void) ibd_unattach(state, dip);
2596 
2597 	return (DDI_SUCCESS);
2598 }
2599 
2600 /*
2601  * Pre ibt_attach() driver initialization
2602  */
2603 static int
2604 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2605 {
2606 	char buf[64];
2607 
2608 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2609 	state->id_link_state = LINK_STATE_UNKNOWN;
2610 
2611 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2612 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2613 	state->id_trap_stop = B_TRUE;
2614 	state->id_trap_inprog = 0;
2615 
2616 	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2617 	state->id_dip = dip;
2618 
2619 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2620 
2621 	state->id_tx_list.dl_head = NULL;
2622 	state->id_tx_list.dl_tail = NULL;
2623 	state->id_tx_list.dl_pending_sends = B_FALSE;
2624 	state->id_tx_list.dl_cnt = 0;
2625 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2626 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2627 	state->id_tx_busy = 0;
2628 
2629 	state->id_rx_list.dl_head = NULL;
2630 	state->id_rx_list.dl_tail = NULL;
2631 	state->id_rx_list.dl_bufs_outstanding = 0;
2632 	state->id_rx_list.dl_cnt = 0;
2633 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2634 	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
2635 
2636 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2637 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2638 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2639 
2640 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2641 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2642 
2643 	return (DDI_SUCCESS);
2644 }
2645 
2646 /*
2647  * Post ibt_detach() driver deconstruction
2648  */
2649 static void
2650 ibd_state_fini(ibd_state_t *state)
2651 {
2652 	cv_destroy(&state->id_macst_cv);
2653 	mutex_destroy(&state->id_macst_lock);
2654 
2655 	kmem_cache_destroy(state->id_req_kmc);
2656 
2657 	mutex_destroy(&state->id_rxpost_lock);
2658 	mutex_destroy(&state->id_rx_list.dl_mutex);
2659 
2660 	mutex_destroy(&state->id_txpost_lock);
2661 	mutex_destroy(&state->id_tx_list.dl_mutex);
2662 
2663 	mutex_destroy(&state->id_sched_lock);
2664 	mutex_destroy(&state->id_cq_poll_lock);
2665 
2666 	cv_destroy(&state->id_trap_cv);
2667 	mutex_destroy(&state->id_trap_lock);
2668 	mutex_destroy(&state->id_link_mutex);
2669 }
2670 
2671 /*
2672  * Fetch link speed from SA for snmp ifspeed reporting.
2673  */
2674 static uint64_t
2675 ibd_get_portspeed(ibd_state_t *state)
2676 {
2677 	int			ret;
2678 	ibt_path_info_t		path;
2679 	ibt_path_attr_t		path_attr;
2680 	uint8_t			num_paths;
2681 	uint64_t		ifspeed;
2682 
2683 	/*
2684 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2685 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2686 	 * 2000000000. Start with that as default.
2687 	 */
2688 	ifspeed = 2000000000;
2689 
2690 	bzero(&path_attr, sizeof (path_attr));
2691 
2692 	/*
2693 	 * Get the port speed from Loopback path information.
2694 	 */
2695 	path_attr.pa_dgids = &state->id_sgid;
2696 	path_attr.pa_num_dgids = 1;
2697 	path_attr.pa_sgid = state->id_sgid;
2698 
2699 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2700 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2701 		goto earlydone;
2702 
2703 	if (num_paths < 1)
2704 		goto earlydone;
2705 
2706 	/*
2707 	 * In case SA does not return an expected value, report the default
2708 	 * speed as 1X.
2709 	 */
2710 	ret = 1;
2711 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2712 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2713 			ret = 1;
2714 			break;
2715 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2716 			ret = 4;
2717 			break;
2718 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2719 			ret = 12;
2720 			break;
2721 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2722 			ret = 2;
2723 			break;
2724 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2725 			ret = 8;
2726 			break;
2727 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2728 			ret = 16;
2729 			break;
2730 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2731 			ret = 24;
2732 			break;
2733 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2734 			ret = 32;
2735 			break;
2736 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2737 			ret = 48;
2738 			break;
2739 	}
2740 
2741 	ifspeed *= ret;
2742 
2743 earlydone:
2744 	return (ifspeed);
2745 }
2746 
2747 /*
2748  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2749  * representing the input mcg mgid.
2750  */
2751 static ibd_mce_t *
2752 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2753 {
2754 	ibd_mce_t *ptr = list_head(mlist);
2755 
2756 	/*
2757 	 * Do plain linear search.
2758 	 */
2759 	while (ptr != NULL) {
2760 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2761 		    sizeof (ib_gid_t)) == 0)
2762 			return (ptr);
2763 		ptr = list_next(mlist, ptr);
2764 	}
2765 	return (NULL);
2766 }
2767 
2768 /*
2769  * Execute IBA JOIN.
2770  */
2771 static ibt_status_t
2772 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2773 {
2774 	ibt_mcg_attr_t mcg_attr;
2775 
2776 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2777 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2778 	mcg_attr.mc_mgid = mgid;
2779 	mcg_attr.mc_join_state = mce->mc_jstate;
2780 	mcg_attr.mc_scope = state->id_scope;
2781 	mcg_attr.mc_pkey = state->id_pkey;
2782 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2783 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2784 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2785 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2786 	    NULL, NULL));
2787 }
2788 
2789 /*
2790  * This code JOINs the port in the proper way (depending on the join
2791  * state) so that IBA fabric will forward mcg packets to/from the port.
2792  * It also attaches the QPN to the mcg so it can receive those mcg
2793  * packets. This code makes sure not to attach the mcg to the QP if
2794  * that has been previously done due to the mcg being joined with a
2795  * different join state, even though this is not required by SWG_0216,
2796  * refid 3610.
2797  */
2798 static ibd_mce_t *
2799 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2800 {
2801 	ibt_status_t ibt_status;
2802 	ibd_mce_t *mce, *tmce, *omce = NULL;
2803 	boolean_t do_attach = B_TRUE;
2804 
2805 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2806 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2807 
2808 	/*
2809 	 * For enable_multicast Full member joins, we need to do some
2810 	 * extra work. If there is already an mce on the list that
2811 	 * indicates full membership, that means the membership has
2812 	 * not yet been dropped (since the disable_multicast was issued)
2813 	 * because there are pending Tx's to the mcg; in that case, just
2814 	 * mark the mce not to be reaped when the Tx completion queues
2815 	 * an async reap operation.
2816 	 *
2817 	 * If there is already an mce on the list indicating sendonly
2818 	 * membership, try to promote to full membership. Be careful
2819 	 * not to deallocate the old mce, since there might be an AH
2820 	 * pointing to it; instead, update the old mce with new data
2821 	 * that tracks the full membership.
2822 	 */
2823 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2824 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2825 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2826 			ASSERT(omce->mc_fullreap);
2827 			omce->mc_fullreap = B_FALSE;
2828 			return (omce);
2829 		} else {
2830 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2831 		}
2832 	}
2833 
2834 	/*
2835 	 * Allocate the ibd_mce_t to track this JOIN.
2836 	 */
2837 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2838 	mce->mc_fullreap = B_FALSE;
2839 	mce->mc_jstate = jstate;
2840 
2841 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2842 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2843 		    ibt_status);
2844 		kmem_free(mce, sizeof (ibd_mce_t));
2845 		return (NULL);
2846 	}
2847 
2848 	/*
2849 	 * Is an IBA attach required? Not if the interface is already joined
2850 	 * to the mcg in a different appropriate join state.
2851 	 */
2852 	if (jstate == IB_MC_JSTATE_NON) {
2853 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2854 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2855 			do_attach = B_FALSE;
2856 	} else if (jstate == IB_MC_JSTATE_FULL) {
2857 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2858 			do_attach = B_FALSE;
2859 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2860 		do_attach = B_FALSE;
2861 	}
2862 
2863 	if (do_attach) {
2864 		/*
2865 		 * Do the IBA attach.
2866 		 */
2867 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2868 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2869 		    &mce->mc_info)) != IBT_SUCCESS) {
2870 			DPRINT(10, "ibd_join_group : failed qp attachment "
2871 			    "%d\n", ibt_status);
2872 			/*
2873 			 * NOTE that we should probably preserve the join info
2874 			 * in the list and later try to leave again at detach
2875 			 * time.
2876 			 */
2877 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2878 			    state->id_sgid, jstate);
2879 			kmem_free(mce, sizeof (ibd_mce_t));
2880 			return (NULL);
2881 		}
2882 	}
2883 
2884 	/*
2885 	 * Insert the ibd_mce_t in the proper list.
2886 	 */
2887 	if (jstate == IB_MC_JSTATE_NON) {
2888 		IBD_MCACHE_INSERT_NON(state, mce);
2889 	} else {
2890 		/*
2891 		 * Set up the mc_req fields used for reaping the
2892 		 * mcg in case of delayed tx completion (see
2893 		 * ibd_tx_cleanup()). Also done for sendonly join in
2894 		 * case we are promoted to fullmembership later and
2895 		 * keep using the same mce.
2896 		 */
2897 		mce->mc_req.rq_gid = mgid;
2898 		mce->mc_req.rq_ptr = mce;
2899 		/*
2900 		 * Check whether this is the case of trying to join
2901 		 * full member, and we were already joined send only.
2902 		 * We try to drop our SendOnly membership, but it is
2903 		 * possible that the mcg does not exist anymore (and
2904 		 * the subnet trap never reached us), so the leave
2905 		 * operation might fail.
2906 		 */
2907 		if (omce != NULL) {
2908 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2909 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2910 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2911 			bcopy(&mce->mc_info, &omce->mc_info,
2912 			    sizeof (ibt_mcg_info_t));
2913 			kmem_free(mce, sizeof (ibd_mce_t));
2914 			return (omce);
2915 		}
2916 		mutex_enter(&state->id_mc_mutex);
2917 		IBD_MCACHE_INSERT_FULL(state, mce);
2918 		mutex_exit(&state->id_mc_mutex);
2919 	}
2920 
2921 	return (mce);
2922 }
2923 
2924 /*
2925  * Called during port up event handling to attempt to reacquire full
2926  * membership to an mcg. Stripped down version of ibd_join_group().
2927  * Note that it is possible that the mcg might have gone away, and
2928  * gets recreated at this point.
2929  */
2930 static void
2931 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2932 {
2933 	ib_gid_t mgid;
2934 
2935 	/*
2936 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2937 	 * reap/leave is going to try to leave the group. We could prevent
2938 	 * that by adding a boolean flag into ibd_mce_t, if required.
2939 	 */
2940 	if (mce->mc_fullreap)
2941 		return;
2942 
2943 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2944 
2945 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2946 	    mgid.gid_guid);
2947 
2948 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2949 		ibd_print_warn(state, "Failure on port up to rejoin "
2950 		    "multicast gid %016llx:%016llx",
2951 		    (u_longlong_t)mgid.gid_prefix,
2952 		    (u_longlong_t)mgid.gid_guid);
2953 }
2954 
2955 /*
2956  * This code handles delayed Tx completion cleanups for mcg's to which
2957  * disable_multicast has been issued, regular mcg related cleanups during
2958  * disable_multicast, disable_promiscous and mcg traps, as well as
2959  * cleanups during driver detach time. Depending on the join state,
2960  * it deletes the mce from the appropriate list and issues the IBA
2961  * leave/detach; except in the disable_multicast case when the mce
2962  * is left on the active list for a subsequent Tx completion cleanup.
2963  */
2964 static void
2965 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2966     uint8_t jstate)
2967 {
2968 	ibd_mce_t *tmce;
2969 	boolean_t do_detach = B_TRUE;
2970 
2971 	/*
2972 	 * Before detaching, we must check whether the other list
2973 	 * contains the mcg; if we detach blindly, the consumer
2974 	 * who set up the other list will also stop receiving
2975 	 * traffic.
2976 	 */
2977 	if (jstate == IB_MC_JSTATE_FULL) {
2978 		/*
2979 		 * The following check is only relevant while coming
2980 		 * from the Tx completion path in the reap case.
2981 		 */
2982 		if (!mce->mc_fullreap)
2983 			return;
2984 		mutex_enter(&state->id_mc_mutex);
2985 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2986 		mutex_exit(&state->id_mc_mutex);
2987 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2988 			do_detach = B_FALSE;
2989 	} else if (jstate == IB_MC_JSTATE_NON) {
2990 		IBD_MCACHE_PULLOUT_NON(state, mce);
2991 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2992 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2993 			do_detach = B_FALSE;
2994 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2995 		mutex_enter(&state->id_mc_mutex);
2996 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2997 		mutex_exit(&state->id_mc_mutex);
2998 		do_detach = B_FALSE;
2999 	}
3000 
3001 	/*
3002 	 * If we are reacting to a mcg trap and leaving our sendonly or
3003 	 * non membership, the mcg is possibly already gone, so attempting
3004 	 * to leave might fail. On the other hand, we must try to leave
3005 	 * anyway, since this might be a trap from long ago, and we could
3006 	 * have potentially sendonly joined to a recent incarnation of
3007 	 * the mcg and are about to loose track of this information.
3008 	 */
3009 	if (do_detach) {
3010 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3011 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3012 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3013 	}
3014 
3015 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3016 	kmem_free(mce, sizeof (ibd_mce_t));
3017 }
3018 
3019 /*
3020  * Async code executed due to multicast and promiscuous disable requests
3021  * and mcg trap handling; also executed during driver detach. Mostly, a
3022  * leave and detach is done; except for the fullmember case when Tx
3023  * requests are pending, whence arrangements are made for subsequent
3024  * cleanup on Tx completion.
3025  */
3026 static void
3027 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3028 {
3029 	ipoib_mac_t mcmac;
3030 	boolean_t recycled;
3031 	ibd_mce_t *mce;
3032 
3033 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3034 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3035 
3036 	if (jstate == IB_MC_JSTATE_NON) {
3037 		recycled = B_TRUE;
3038 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3039 		/*
3040 		 * In case we are handling a mcg trap, we might not find
3041 		 * the mcg in the non list.
3042 		 */
3043 		if (mce == NULL) {
3044 			return;
3045 		}
3046 	} else {
3047 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3048 
3049 		/*
3050 		 * In case we are handling a mcg trap, make sure the trap
3051 		 * is not arriving late; if we have an mce that indicates
3052 		 * that we are already a fullmember, that would be a clear
3053 		 * indication that the trap arrived late (ie, is for a
3054 		 * previous incarnation of the mcg).
3055 		 */
3056 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3057 			if ((mce == NULL) || (mce->mc_jstate ==
3058 			    IB_MC_JSTATE_FULL)) {
3059 				return;
3060 			}
3061 		} else {
3062 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3063 
3064 			/*
3065 			 * If join group failed, mce will be NULL here.
3066 			 * This is because in GLDv3 driver, set multicast
3067 			 *  will always return success.
3068 			 */
3069 			if (mce == NULL) {
3070 				return;
3071 			}
3072 
3073 			mce->mc_fullreap = B_TRUE;
3074 		}
3075 
3076 		/*
3077 		 * If no pending Tx's remain that reference the AH
3078 		 * for the mcg, recycle it from active to free list.
3079 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3080 		 * so the last completing Tx will cause an async reap
3081 		 * operation to be invoked, at which time we will drop our
3082 		 * membership to the mcg so that the pending Tx's complete
3083 		 * successfully. Refer to comments on "AH and MCE active
3084 		 * list manipulation" at top of this file. The lock protects
3085 		 * against Tx fast path and Tx cleanup code.
3086 		 */
3087 		mutex_enter(&state->id_ac_mutex);
3088 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3089 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3090 		    IB_MC_JSTATE_SEND_ONLY_NON));
3091 		mutex_exit(&state->id_ac_mutex);
3092 	}
3093 
3094 	if (recycled) {
3095 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3096 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3097 		ibd_async_reap_group(state, mce, mgid, jstate);
3098 	}
3099 }
3100 
3101 /*
3102  * Find the broadcast address as defined by IPoIB; implicitly
3103  * determines the IBA scope, mtu, tclass etc of the link the
3104  * interface is going to be a member of.
3105  */
3106 static ibt_status_t
3107 ibd_find_bgroup(ibd_state_t *state)
3108 {
3109 	ibt_mcg_attr_t mcg_attr;
3110 	uint_t numg;
3111 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3112 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3113 	    IB_MC_SCOPE_GLOBAL };
3114 	int i, mcgmtu;
3115 	boolean_t found = B_FALSE;
3116 	int ret;
3117 	ibt_mcg_info_t mcg_info;
3118 
3119 	state->id_bgroup_created = B_FALSE;
3120 
3121 query_bcast_grp:
3122 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3123 	mcg_attr.mc_pkey = state->id_pkey;
3124 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3125 
3126 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3127 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3128 
3129 		/*
3130 		 * Look for the IPoIB broadcast group.
3131 		 */
3132 		state->id_mgid.gid_prefix =
3133 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3134 		    ((uint64_t)state->id_scope << 48) |
3135 		    ((uint32_t)(state->id_pkey << 16)));
3136 		mcg_attr.mc_mgid = state->id_mgid;
3137 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3138 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3139 			found = B_TRUE;
3140 			break;
3141 		}
3142 	}
3143 
3144 	if (!found) {
3145 		if (ibd_create_broadcast_group) {
3146 			/*
3147 			 * If we created the broadcast group, but failed to
3148 			 * find it, we can't do anything except leave the
3149 			 * one we created and return failure.
3150 			 */
3151 			if (state->id_bgroup_created) {
3152 				ibd_print_warn(state, "IPoIB broadcast group "
3153 				    "absent. Unable to query after create.");
3154 				goto find_bgroup_fail;
3155 			}
3156 
3157 			/*
3158 			 * Create the ipoib broadcast group if it didn't exist
3159 			 */
3160 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3161 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3162 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3163 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3164 			mcg_attr.mc_pkey = state->id_pkey;
3165 			mcg_attr.mc_flow = 0;
3166 			mcg_attr.mc_sl = 0;
3167 			mcg_attr.mc_tclass = 0;
3168 			state->id_mgid.gid_prefix =
3169 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3170 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3171 			    ((uint32_t)(state->id_pkey << 16)));
3172 			mcg_attr.mc_mgid = state->id_mgid;
3173 
3174 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3175 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3176 				ibd_print_warn(state, "IPoIB broadcast group "
3177 				    "absent, create failed: ret = %d\n", ret);
3178 				state->id_bgroup_created = B_FALSE;
3179 				return (IBT_FAILURE);
3180 			}
3181 			state->id_bgroup_created = B_TRUE;
3182 			goto query_bcast_grp;
3183 		} else {
3184 			ibd_print_warn(state, "IPoIB broadcast group absent");
3185 			return (IBT_FAILURE);
3186 		}
3187 	}
3188 
3189 	/*
3190 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3191 	 */
3192 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3193 	if (state->id_mtu < mcgmtu) {
3194 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3195 		    "greater than port's maximum MTU %d", mcgmtu,
3196 		    state->id_mtu);
3197 		ibt_free_mcg_info(state->id_mcinfo, 1);
3198 		goto find_bgroup_fail;
3199 	}
3200 	state->id_mtu = mcgmtu;
3201 
3202 	return (IBT_SUCCESS);
3203 
3204 find_bgroup_fail:
3205 	if (state->id_bgroup_created) {
3206 		(void) ibt_leave_mcg(state->id_sgid,
3207 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3208 		    IB_MC_JSTATE_FULL);
3209 	}
3210 
3211 	return (IBT_FAILURE);
3212 }
3213 
3214 static int
3215 ibd_alloc_tx_copybufs(ibd_state_t *state)
3216 {
3217 	ibt_mr_attr_t mem_attr;
3218 
3219 	/*
3220 	 * Allocate one big chunk for all regular tx copy bufs
3221 	 */
3222 	state->id_tx_buf_sz = state->id_mtu;
3223 	if (state->id_lso_policy && state->id_lso_capable &&
3224 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3225 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3226 	}
3227 
3228 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3229 	    state->id_tx_buf_sz, KM_SLEEP);
3230 
3231 	/*
3232 	 * Do one memory registration on the entire txbuf area
3233 	 */
3234 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3235 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3236 	mem_attr.mr_as = NULL;
3237 	mem_attr.mr_flags = IBT_MR_SLEEP;
3238 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3239 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3240 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3241 		kmem_free(state->id_tx_bufs,
3242 		    state->id_num_swqe * state->id_tx_buf_sz);
3243 		state->id_tx_bufs = NULL;
3244 		return (DDI_FAILURE);
3245 	}
3246 
3247 	return (DDI_SUCCESS);
3248 }
3249 
3250 static int
3251 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3252 {
3253 	ibt_mr_attr_t mem_attr;
3254 	ibd_lsobuf_t *buflist;
3255 	ibd_lsobuf_t *lbufp;
3256 	ibd_lsobuf_t *tail;
3257 	ibd_lsobkt_t *bktp;
3258 	uint8_t *membase;
3259 	uint8_t *memp;
3260 	uint_t memsz;
3261 	int i;
3262 
3263 	/*
3264 	 * Allocate the lso bucket
3265 	 */
3266 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3267 
3268 	/*
3269 	 * Allocate the entire lso memory and register it
3270 	 */
3271 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3272 	membase = kmem_zalloc(memsz, KM_SLEEP);
3273 
3274 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3275 	mem_attr.mr_len = memsz;
3276 	mem_attr.mr_as = NULL;
3277 	mem_attr.mr_flags = IBT_MR_SLEEP;
3278 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3279 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3280 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3281 		kmem_free(membase, memsz);
3282 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3283 		return (DDI_FAILURE);
3284 	}
3285 
3286 	/*
3287 	 * Now allocate the buflist.  Note that the elements in the buflist and
3288 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3289 	 * can always derive the address of a buflist entry from the address of
3290 	 * an lso buffer.
3291 	 */
3292 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3293 	    KM_SLEEP);
3294 
3295 	/*
3296 	 * Set up the lso buf chain
3297 	 */
3298 	memp = membase;
3299 	lbufp = buflist;
3300 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3301 		lbufp->lb_isfree = 1;
3302 		lbufp->lb_buf = memp;
3303 		lbufp->lb_next = lbufp + 1;
3304 
3305 		tail = lbufp;
3306 
3307 		memp += IBD_LSO_BUFSZ;
3308 		lbufp++;
3309 	}
3310 	tail->lb_next = NULL;
3311 
3312 	/*
3313 	 * Set up the LSO buffer information in ibd state
3314 	 */
3315 	bktp->bkt_bufl = buflist;
3316 	bktp->bkt_free_head = buflist;
3317 	bktp->bkt_mem = membase;
3318 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3319 	bktp->bkt_nfree = bktp->bkt_nelem;
3320 
3321 	state->id_lso = bktp;
3322 
3323 	return (DDI_SUCCESS);
3324 }
3325 
3326 /*
3327  * Statically allocate Tx buffer list(s).
3328  */
3329 static int
3330 ibd_init_txlist(ibd_state_t *state)
3331 {
3332 	ibd_swqe_t *swqe;
3333 	ibt_lkey_t lkey;
3334 	int i;
3335 
3336 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3337 		return (DDI_FAILURE);
3338 
3339 	if (state->id_lso_policy && state->id_lso_capable) {
3340 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3341 			state->id_lso_policy = B_FALSE;
3342 	}
3343 
3344 	/*
3345 	 * Allocate and setup the swqe list
3346 	 */
3347 	lkey = state->id_tx_mr_desc.md_lkey;
3348 	for (i = 0; i < state->id_num_swqe; i++) {
3349 		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
3350 			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
3351 			ibd_fini_txlist(state);
3352 			return (DDI_FAILURE);
3353 		}
3354 
3355 		/* add to list */
3356 		state->id_tx_list.dl_cnt++;
3357 		if (state->id_tx_list.dl_head == NULL) {
3358 			swqe->swqe_prev = NULL;
3359 			swqe->swqe_next = NULL;
3360 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3361 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3362 		} else {
3363 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3364 			swqe->swqe_next = NULL;
3365 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3366 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3367 		}
3368 	}
3369 
3370 	return (DDI_SUCCESS);
3371 }
3372 
3373 static int
3374 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3375     uint32_t *nds_p)
3376 {
3377 	ibd_lsobkt_t *bktp;
3378 	ibd_lsobuf_t *lbufp;
3379 	ibd_lsobuf_t *nextp;
3380 	ibt_lkey_t lso_lkey;
3381 	uint_t frag_sz;
3382 	uint_t num_needed;
3383 	int i;
3384 
3385 	ASSERT(sgl_p != NULL);
3386 	ASSERT(nds_p != NULL);
3387 	ASSERT(req_sz != 0);
3388 
3389 	/*
3390 	 * Determine how many bufs we'd need for the size requested
3391 	 */
3392 	num_needed = req_sz / IBD_LSO_BUFSZ;
3393 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3394 		num_needed++;
3395 
3396 	mutex_enter(&state->id_lso_lock);
3397 
3398 	/*
3399 	 * If we don't have enough lso bufs, return failure
3400 	 */
3401 	ASSERT(state->id_lso != NULL);
3402 	bktp = state->id_lso;
3403 	if (bktp->bkt_nfree < num_needed) {
3404 		mutex_exit(&state->id_lso_lock);
3405 		return (-1);
3406 	}
3407 
3408 	/*
3409 	 * Pick the first 'num_needed' bufs from the free list
3410 	 */
3411 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3412 	lbufp = bktp->bkt_free_head;
3413 	for (i = 0; i < num_needed; i++) {
3414 		ASSERT(lbufp->lb_isfree != 0);
3415 		ASSERT(lbufp->lb_buf != NULL);
3416 
3417 		nextp = lbufp->lb_next;
3418 
3419 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3420 		sgl_p[i].ds_key = lso_lkey;
3421 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3422 
3423 		lbufp->lb_isfree = 0;
3424 		lbufp->lb_next = NULL;
3425 
3426 		lbufp = nextp;
3427 	}
3428 	bktp->bkt_free_head = lbufp;
3429 
3430 	/*
3431 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3432 	 * to adjust the last sgl entry's length. Since we know we need atleast
3433 	 * one, the i-1 use below is ok.
3434 	 */
3435 	if (frag_sz) {
3436 		sgl_p[i-1].ds_len = frag_sz;
3437 	}
3438 
3439 	/*
3440 	 * Update nfree count and return
3441 	 */
3442 	bktp->bkt_nfree -= num_needed;
3443 
3444 	mutex_exit(&state->id_lso_lock);
3445 
3446 	*nds_p = num_needed;
3447 
3448 	return (0);
3449 }
3450 
3451 static void
3452 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3453 {
3454 	ibd_lsobkt_t *bktp;
3455 	ibd_lsobuf_t *lbufp;
3456 	uint8_t *lso_mem_end;
3457 	uint_t ndx;
3458 	int i;
3459 
3460 	mutex_enter(&state->id_lso_lock);
3461 
3462 	bktp = state->id_lso;
3463 	ASSERT(bktp != NULL);
3464 
3465 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3466 	for (i = 0; i < nds; i++) {
3467 		uint8_t *va;
3468 
3469 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3470 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3471 
3472 		/*
3473 		 * Figure out the buflist element this sgl buffer corresponds
3474 		 * to and put it back at the head
3475 		 */
3476 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3477 		lbufp = bktp->bkt_bufl + ndx;
3478 
3479 		ASSERT(lbufp->lb_isfree == 0);
3480 		ASSERT(lbufp->lb_buf == va);
3481 
3482 		lbufp->lb_isfree = 1;
3483 		lbufp->lb_next = bktp->bkt_free_head;
3484 		bktp->bkt_free_head = lbufp;
3485 	}
3486 	bktp->bkt_nfree += nds;
3487 
3488 	mutex_exit(&state->id_lso_lock);
3489 }
3490 
3491 static void
3492 ibd_free_tx_copybufs(ibd_state_t *state)
3493 {
3494 	/*
3495 	 * Unregister txbuf mr
3496 	 */
3497 	if (ibt_deregister_mr(state->id_hca_hdl,
3498 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3499 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3500 	}
3501 	state->id_tx_mr_hdl = NULL;
3502 
3503 	/*
3504 	 * Free txbuf memory
3505 	 */
3506 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3507 	state->id_tx_bufs = NULL;
3508 }
3509 
3510 static void
3511 ibd_free_tx_lsobufs(ibd_state_t *state)
3512 {
3513 	ibd_lsobkt_t *bktp;
3514 
3515 	mutex_enter(&state->id_lso_lock);
3516 
3517 	if ((bktp = state->id_lso) == NULL) {
3518 		mutex_exit(&state->id_lso_lock);
3519 		return;
3520 	}
3521 
3522 	/*
3523 	 * First, free the buflist
3524 	 */
3525 	ASSERT(bktp->bkt_bufl != NULL);
3526 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3527 
3528 	/*
3529 	 * Unregister the LSO memory and free it
3530 	 */
3531 	ASSERT(bktp->bkt_mr_hdl != NULL);
3532 	if (ibt_deregister_mr(state->id_hca_hdl,
3533 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3534 		DPRINT(10,
3535 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3536 	}
3537 	ASSERT(bktp->bkt_mem);
3538 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3539 
3540 	/*
3541 	 * Finally free the bucket
3542 	 */
3543 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3544 	state->id_lso = NULL;
3545 
3546 	mutex_exit(&state->id_lso_lock);
3547 }
3548 
3549 /*
3550  * Free the statically allocated Tx buffer list.
3551  */
3552 static void
3553 ibd_fini_txlist(ibd_state_t *state)
3554 {
3555 	ibd_swqe_t *node;
3556 
3557 	/*
3558 	 * Free the allocated swqes
3559 	 */
3560 	mutex_enter(&state->id_tx_list.dl_mutex);
3561 	while (state->id_tx_list.dl_head != NULL) {
3562 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3563 		state->id_tx_list.dl_head = node->swqe_next;
3564 		ASSERT(state->id_tx_list.dl_cnt > 0);
3565 		state->id_tx_list.dl_cnt--;
3566 		ibd_free_swqe(state, node);
3567 	}
3568 	mutex_exit(&state->id_tx_list.dl_mutex);
3569 
3570 	ibd_free_tx_lsobufs(state);
3571 	ibd_free_tx_copybufs(state);
3572 }
3573 
3574 /*
3575  * Allocate a single send wqe and register it so it is almost
3576  * ready to be posted to the hardware.
3577  */
3578 static int
3579 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
3580 {
3581 	ibd_swqe_t *swqe;
3582 
3583 	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
3584 	*wqe = swqe;
3585 
3586 	swqe->swqe_type = IBD_WQE_SEND;
3587 	swqe->swqe_next = NULL;
3588 	swqe->swqe_prev = NULL;
3589 	swqe->swqe_im_mblk = NULL;
3590 
3591 	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3592 	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
3593 	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3594 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3595 
3596 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3597 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3598 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3599 
3600 	/* These are set in send */
3601 	swqe->w_swr.wr_nds = 0;
3602 	swqe->w_swr.wr_sgl = NULL;
3603 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3604 
3605 	return (DDI_SUCCESS);
3606 }
3607 
3608 /*
3609  * Free an allocated send wqe.
3610  */
3611 /*ARGSUSED*/
3612 static void
3613 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3614 {
3615 	kmem_free(swqe, sizeof (ibd_swqe_t));
3616 }
3617 
3618 /*
3619  * Post a rwqe to the hardware and add it to the Rx list. The
3620  * "recycle" parameter indicates whether an old rwqe is being
3621  * recycled, or this is a new one.
3622  */
3623 static int
3624 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3625 {
3626 	ibt_status_t ibt_status;
3627 
3628 	if (recycle == B_FALSE) {
3629 		mutex_enter(&state->id_rx_list.dl_mutex);
3630 		if (state->id_rx_list.dl_head == NULL) {
3631 			rwqe->rwqe_prev = NULL;
3632 			rwqe->rwqe_next = NULL;
3633 			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3634 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3635 		} else {
3636 			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3637 			rwqe->rwqe_next = NULL;
3638 			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3639 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3640 		}
3641 		mutex_exit(&state->id_rx_list.dl_mutex);
3642 	}
3643 
3644 	mutex_enter(&state->id_rxpost_lock);
3645 	if (state->id_rx_busy) {
3646 		rwqe->w_post_link = NULL;
3647 		if (state->id_rx_head)
3648 			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
3649 		else
3650 			state->id_rx_head = rwqe;
3651 		state->id_rx_tailp = &(rwqe->w_post_link);
3652 	} else {
3653 		state->id_rx_busy = 1;
3654 		do {
3655 			mutex_exit(&state->id_rxpost_lock);
3656 
3657 			/*
3658 			 * Here we should add dl_cnt before post recv, because
3659 			 * we would have to make sure dl_cnt is updated before
3660 			 * the corresponding ibd_process_rx() is called.
3661 			 */
3662 			atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3663 
3664 			ibt_status = ibt_post_recv(state->id_chnl_hdl,
3665 			    &rwqe->w_rwr, 1, NULL);
3666 			if (ibt_status != IBT_SUCCESS) {
3667 				(void) atomic_add_32_nv(
3668 				    &state->id_rx_list.dl_cnt, -1);
3669 				ibd_print_warn(state, "ibd_post_recv: "
3670 				    "posting failed, ret=%d", ibt_status);
3671 				return (DDI_FAILURE);
3672 			}
3673 
3674 			mutex_enter(&state->id_rxpost_lock);
3675 			rwqe = state->id_rx_head;
3676 			if (rwqe) {
3677 				state->id_rx_head =
3678 				    (ibd_rwqe_t *)(rwqe->w_post_link);
3679 			}
3680 		} while (rwqe);
3681 		state->id_rx_busy = 0;
3682 	}
3683 	mutex_exit(&state->id_rxpost_lock);
3684 
3685 	return (DDI_SUCCESS);
3686 }
3687 
3688 /*
3689  * Allocate the statically allocated Rx buffer list.
3690  */
3691 static int
3692 ibd_init_rxlist(ibd_state_t *state)
3693 {
3694 	ibd_rwqe_t *rwqe;
3695 	int i;
3696 
3697 	for (i = 0; i < state->id_num_rwqe; i++) {
3698 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3699 			ibd_fini_rxlist(state);
3700 			return (DDI_FAILURE);
3701 		}
3702 
3703 		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
3704 			ibd_free_rwqe(state, rwqe);
3705 			ibd_fini_rxlist(state);
3706 			return (DDI_FAILURE);
3707 		}
3708 	}
3709 
3710 	return (DDI_SUCCESS);
3711 }
3712 
3713 /*
3714  * Free the statically allocated Rx buffer list.
3715  *
3716  */
3717 static void
3718 ibd_fini_rxlist(ibd_state_t *state)
3719 {
3720 	ibd_rwqe_t *node;
3721 
3722 	mutex_enter(&state->id_rx_list.dl_mutex);
3723 	while (state->id_rx_list.dl_head != NULL) {
3724 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3725 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3726 		ASSERT(state->id_rx_list.dl_cnt > 0);
3727 		state->id_rx_list.dl_cnt--;
3728 
3729 		ibd_free_rwqe(state, node);
3730 	}
3731 	mutex_exit(&state->id_rx_list.dl_mutex);
3732 }
3733 
3734 /*
3735  * Allocate a single recv wqe and register it so it is almost
3736  * ready to be posted to the hardware.
3737  */
3738 static int
3739 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3740 {
3741 	ibt_mr_attr_t mem_attr;
3742 	ibd_rwqe_t *rwqe;
3743 
3744 	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3745 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3746 		return (DDI_FAILURE);
3747 	}
3748 	*wqe = rwqe;
3749 	rwqe->rwqe_type = IBD_WQE_RECV;
3750 	rwqe->w_state = state;
3751 	rwqe->rwqe_next = NULL;
3752 	rwqe->rwqe_prev = NULL;
3753 	rwqe->w_freeing_wqe = B_FALSE;
3754 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3755 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3756 
3757 	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3758 	    IPOIB_GRH_SIZE, KM_NOSLEEP);
3759 	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
3760 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3761 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3762 		return (DDI_FAILURE);
3763 	}
3764 
3765 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3766 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3767 	    NULL) {
3768 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3769 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3770 		    state->id_mtu + IPOIB_GRH_SIZE);
3771 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3772 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3773 		return (DDI_FAILURE);
3774 	}
3775 
3776 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3777 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3778 	mem_attr.mr_as = NULL;
3779 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3780 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3781 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3782 	    IBT_SUCCESS) {
3783 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
3784 		rwqe->w_freeing_wqe = B_TRUE;
3785 		freemsg(rwqe->rwqe_im_mblk);
3786 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3787 		    state->id_mtu + IPOIB_GRH_SIZE);
3788 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3789 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3790 		return (DDI_FAILURE);
3791 	}
3792 
3793 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3794 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3795 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3796 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3797 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3798 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3799 	rwqe->w_rwr.wr_nds = 1;
3800 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3801 
3802 	return (DDI_SUCCESS);
3803 }
3804 
3805 /*
3806  * Free an allocated recv wqe.
3807  */
3808 static void
3809 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3810 {
3811 	if (ibt_deregister_mr(state->id_hca_hdl,
3812 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3813 		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
3814 		return;
3815 	}
3816 
3817 	/*
3818 	 * Indicate to the callback function that this rwqe/mblk
3819 	 * should not be recycled. The freemsg() will invoke
3820 	 * ibd_freemsg_cb().
3821 	 */
3822 	if (rwqe->rwqe_im_mblk != NULL) {
3823 		rwqe->w_freeing_wqe = B_TRUE;
3824 		freemsg(rwqe->rwqe_im_mblk);
3825 	}
3826 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3827 	    state->id_mtu + IPOIB_GRH_SIZE);
3828 	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3829 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3830 }
3831 
3832 /*
3833  * Delete the rwqe being freed from the rx list.
3834  */
3835 static void
3836 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3837 {
3838 	mutex_enter(&state->id_rx_list.dl_mutex);
3839 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3840 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3841 	else
3842 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3843 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3844 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3845 	else
3846 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3847 	mutex_exit(&state->id_rx_list.dl_mutex);
3848 }
3849 
3850 /*
3851  * IBA Rx/Tx completion queue handler. Guaranteed to be single
3852  * threaded and nonreentrant for this CQ. When using combined CQ,
3853  * this handles Tx and Rx completions. With separate CQs, this handles
3854  * only Rx completions.
3855  */
3856 /* ARGSUSED */
3857 static void
3858 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3859 {
3860 	ibd_state_t *state = (ibd_state_t *)arg;
3861 
3862 	atomic_add_64(&state->id_num_intrs, 1);
3863 
3864 	if (ibd_rx_softintr == 1)
3865 		ddi_trigger_softintr(state->id_rx);
3866 	else
3867 		(void) ibd_intr((char *)state);
3868 }
3869 
3870 /*
3871  * Separate CQ handler for Tx completions, when the Tx CQ is in
3872  * interrupt driven mode.
3873  */
3874 /* ARGSUSED */
3875 static void
3876 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3877 {
3878 	ibd_state_t *state = (ibd_state_t *)arg;
3879 
3880 	atomic_add_64(&state->id_num_intrs, 1);
3881 
3882 	if (ibd_tx_softintr == 1)
3883 		ddi_trigger_softintr(state->id_tx);
3884 	else
3885 		(void) ibd_tx_recycle((char *)state);
3886 }
3887 
3888 /*
3889  * Multicast group create/delete trap handler. These will be delivered
3890  * on a kernel thread (handling can thus block) and can be invoked
3891  * concurrently. The handler can be invoked anytime after it is
3892  * registered and before ibt_detach().
3893  */
3894 /* ARGSUSED */
3895 static void
3896 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3897     ibt_subnet_event_t *event)
3898 {
3899 	ibd_state_t *state = (ibd_state_t *)arg;
3900 	ibd_req_t *req;
3901 
3902 	/*
3903 	 * The trap handler will get invoked once for every event for
3904 	 * evert port. The input "gid" is the GID0 of the port the
3905 	 * trap came in on; we just need to act on traps that came
3906 	 * to our port, meaning the port on which the ipoib interface
3907 	 * resides. Since ipoib uses GID0 of the port, we just match
3908 	 * the gids to check whether we need to handle the trap.
3909 	 */
3910 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3911 		return;
3912 
3913 	DPRINT(10, "ibd_notices_handler : %d\n", code);
3914 
3915 	switch (code) {
3916 		case IBT_SM_EVENT_UNAVAILABLE:
3917 			/*
3918 			 * If we are in promiscuous mode or have
3919 			 * sendnonmembers, we need to print a warning
3920 			 * message right now. Else, just store the
3921 			 * information, print when we enter promiscuous
3922 			 * mode or attempt nonmember send. We might
3923 			 * also want to stop caching sendnonmember.
3924 			 */
3925 			ibd_print_warn(state, "IBA multicast support "
3926 			    "degraded due to unavailability of multicast "
3927 			    "traps");
3928 			break;
3929 		case IBT_SM_EVENT_AVAILABLE:
3930 			/*
3931 			 * If we printed a warning message above or
3932 			 * while trying to nonmember send or get into
3933 			 * promiscuous mode, print an okay message.
3934 			 */
3935 			ibd_print_warn(state, "IBA multicast support "
3936 			    "restored due to availability of multicast "
3937 			    "traps");
3938 			break;
3939 		case IBT_SM_EVENT_MCG_CREATED:
3940 		case IBT_SM_EVENT_MCG_DELETED:
3941 			/*
3942 			 * Common processing of creation/deletion traps.
3943 			 * First check if the instance is being
3944 			 * [de]initialized; back off then, without doing
3945 			 * anything more, since we are not sure if the
3946 			 * async thread is around, or whether we might
3947 			 * be racing with the detach code in ibd_m_stop()
3948 			 * that scans the mcg list.
3949 			 */
3950 			if (!ibd_async_safe(state))
3951 				return;
3952 
3953 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
3954 			req->rq_gid = event->sm_notice_gid;
3955 			req->rq_ptr = (void *)code;
3956 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
3957 			break;
3958 	}
3959 }
3960 
3961 static void
3962 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
3963 {
3964 	ib_gid_t mgid = req->rq_gid;
3965 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
3966 
3967 	DPRINT(10, "ibd_async_trap : %d\n", code);
3968 
3969 	/*
3970 	 * Atomically search the nonmember and sendonlymember lists and
3971 	 * delete.
3972 	 */
3973 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
3974 
3975 	if (state->id_prom_op == IBD_OP_COMPLETED) {
3976 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
3977 
3978 		/*
3979 		 * If in promiscuous mode, try to join/attach to the new
3980 		 * mcg. Given the unreliable out-of-order mode of trap
3981 		 * delivery, we can never be sure whether it is a problem
3982 		 * if the join fails. Thus, we warn the admin of a failure
3983 		 * if this was a creation trap. Note that the trap might
3984 		 * actually be reporting a long past event, and the mcg
3985 		 * might already have been deleted, thus we might be warning
3986 		 * in vain.
3987 		 */
3988 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
3989 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
3990 			ibd_print_warn(state, "IBA promiscuous mode missed "
3991 			    "new multicast gid %016llx:%016llx",
3992 			    (u_longlong_t)mgid.gid_prefix,
3993 			    (u_longlong_t)mgid.gid_guid);
3994 	}
3995 
3996 	/*
3997 	 * Free the request slot allocated by the subnet event thread.
3998 	 */
3999 	ibd_async_done(state);
4000 }
4001 
4002 /*
4003  * GLDv3 entry point to get capabilities.
4004  */
4005 static boolean_t
4006 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4007 {
4008 	ibd_state_t *state = arg;
4009 
4010 	switch (cap) {
4011 	case MAC_CAPAB_HCKSUM: {
4012 		uint32_t *txflags = cap_data;
4013 
4014 		/*
4015 		 * We either do full checksum or not do it at all
4016 		 */
4017 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4018 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4019 		else
4020 			return (B_FALSE);
4021 		break;
4022 	}
4023 
4024 	case MAC_CAPAB_LSO: {
4025 		mac_capab_lso_t *cap_lso = cap_data;
4026 
4027 		/*
4028 		 * In addition to the capability and policy, since LSO
4029 		 * relies on hw checksum, we'll not enable LSO if we
4030 		 * don't have hw checksum.  Of course, if the HCA doesn't
4031 		 * provide the reserved lkey capability, enabling LSO will
4032 		 * actually affect performance adversely, so we'll disable
4033 		 * LSO even for that case.
4034 		 */
4035 		if (!state->id_lso_policy || !state->id_lso_capable)
4036 			return (B_FALSE);
4037 
4038 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4039 			return (B_FALSE);
4040 
4041 		if (state->id_hca_res_lkey_capab == 0) {
4042 			ibd_print_warn(state, "no reserved-lkey capability, "
4043 			    "disabling LSO");
4044 			return (B_FALSE);
4045 		}
4046 
4047 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4048 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4049 		break;
4050 	}
4051 
4052 	default:
4053 		return (B_FALSE);
4054 	}
4055 
4056 	return (B_TRUE);
4057 }
4058 
4059 static int
4060 ibd_get_port_details(ibd_state_t *state)
4061 {
4062 	ibt_hca_portinfo_t *port_infop;
4063 	ibt_status_t ret;
4064 	uint_t psize, port_infosz;
4065 
4066 	mutex_enter(&state->id_link_mutex);
4067 
4068 	/*
4069 	 * Query for port information
4070 	 */
4071 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4072 	    &port_infop, &psize, &port_infosz);
4073 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
4074 		mutex_exit(&state->id_link_mutex);
4075 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4076 		    "failed, ret=%d", ret);
4077 		return (ENETDOWN);
4078 	}
4079 
4080 	/*
4081 	 * If the link already went down by the time we get here,
4082 	 * give up
4083 	 */
4084 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
4085 		mutex_exit(&state->id_link_mutex);
4086 		ibt_free_portinfo(port_infop, port_infosz);
4087 		DPRINT(10, "ibd_get_port_details: port is not active");
4088 		return (ENETDOWN);
4089 	}
4090 
4091 	/*
4092 	 * If the link is active, verify the pkey
4093 	 */
4094 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4095 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4096 		mutex_exit(&state->id_link_mutex);
4097 		ibt_free_portinfo(port_infop, port_infosz);
4098 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
4099 		    "failed, ret=%d", ret);
4100 		return (ENONET);
4101 	}
4102 
4103 	state->id_mtu = (128 << port_infop->p_mtu);
4104 	state->id_sgid = *port_infop->p_sgid_tbl;
4105 	state->id_link_state = LINK_STATE_UP;
4106 
4107 	mutex_exit(&state->id_link_mutex);
4108 	ibt_free_portinfo(port_infop, port_infosz);
4109 
4110 	/*
4111 	 * Now that the port is active, record the port speed
4112 	 */
4113 	state->id_link_speed = ibd_get_portspeed(state);
4114 
4115 	return (0);
4116 }
4117 
4118 static int
4119 ibd_alloc_cqs(ibd_state_t *state)
4120 {
4121 	ibt_hca_attr_t hca_attrs;
4122 	ibt_cq_attr_t cq_attr;
4123 	ibt_status_t ret;
4124 	uint32_t real_size;
4125 
4126 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4127 	ASSERT(ret == IBT_SUCCESS);
4128 
4129 	/*
4130 	 * Allocate Rx/combined CQ:
4131 	 * Theoretically, there is no point in having more than #rwqe
4132 	 * plus #swqe cqe's, except that the CQ will be signalled for
4133 	 * overflow when the last wqe completes, if none of the previous
4134 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
4135 	 * to make sure such overflow does not occur.
4136 	 */
4137 	cq_attr.cq_sched = NULL;
4138 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4139 
4140 	if (ibd_separate_cqs == 1) {
4141 		/*
4142 		 * Allocate Receive CQ.
4143 		 */
4144 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4145 			cq_attr.cq_size = state->id_num_rwqe + 1;
4146 		} else {
4147 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4148 			state->id_num_rwqe = cq_attr.cq_size - 1;
4149 		}
4150 
4151 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4152 		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4153 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4154 			    "failed, ret=%d\n", ret);
4155 			return (DDI_FAILURE);
4156 		}
4157 
4158 		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4159 		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4160 			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4161 			    "moderation failed, ret=%d\n", ret);
4162 		}
4163 
4164 		state->id_rxwcs_size = state->id_num_rwqe + 1;
4165 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4166 		    state->id_rxwcs_size, KM_SLEEP);
4167 
4168 		/*
4169 		 * Allocate Send CQ.
4170 		 */
4171 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4172 			cq_attr.cq_size = state->id_num_swqe + 1;
4173 		} else {
4174 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4175 			state->id_num_swqe = cq_attr.cq_size - 1;
4176 		}
4177 
4178 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4179 		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4180 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4181 			    "failed, ret=%d\n", ret);
4182 			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4183 			    state->id_rxwcs_size);
4184 			(void) ibt_free_cq(state->id_rcq_hdl);
4185 			return (DDI_FAILURE);
4186 		}
4187 		if ((ret = ibt_modify_cq(state->id_scq_hdl,
4188 		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
4189 			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4190 			    "moderation failed, ret=%d\n", ret);
4191 		}
4192 
4193 		state->id_txwcs_size = state->id_num_swqe + 1;
4194 		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4195 		    state->id_txwcs_size, KM_SLEEP);
4196 	} else {
4197 		/*
4198 		 * Allocate combined Send/Receive CQ.
4199 		 */
4200 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
4201 		    state->id_num_swqe + 1)) {
4202 			cq_attr.cq_size = state->id_num_rwqe +
4203 			    state->id_num_swqe + 1;
4204 		} else {
4205 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4206 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
4207 			    state->id_num_rwqe) / (state->id_num_rwqe +
4208 			    state->id_num_swqe);
4209 			state->id_num_swqe = cq_attr.cq_size - 1 -
4210 			    state->id_num_rwqe;
4211 		}
4212 
4213 		state->id_rxwcs_size = cq_attr.cq_size;
4214 		state->id_txwcs_size = state->id_rxwcs_size;
4215 
4216 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4217 		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4218 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
4219 			    "failed, ret=%d\n", ret);
4220 			return (DDI_FAILURE);
4221 		}
4222 		state->id_scq_hdl = state->id_rcq_hdl;
4223 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4224 		    state->id_rxwcs_size, KM_SLEEP);
4225 		state->id_txwcs = state->id_rxwcs;
4226 	}
4227 
4228 	/*
4229 	 * Print message in case we could not allocate as many wqe's
4230 	 * as was requested.
4231 	 */
4232 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4233 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4234 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4235 	}
4236 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4237 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4238 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4239 	}
4240 
4241 	return (DDI_SUCCESS);
4242 }
4243 
4244 static int
4245 ibd_setup_ud_channel(ibd_state_t *state)
4246 {
4247 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4248 	ibt_ud_chan_query_attr_t ud_chan_attr;
4249 	ibt_status_t ret;
4250 
4251 	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
4252 	if (state->id_hca_res_lkey_capab)
4253 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4254 	if (state->id_lso_policy && state->id_lso_capable)
4255 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4256 
4257 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4258 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4259 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4260 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4261 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4262 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4263 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4264 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4265 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4266 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4267 	ud_alloc_attr.ud_clone_chan	= NULL;
4268 
4269 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4270 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4271 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4272 		    "failed, ret=%d\n", ret);
4273 		return (DDI_FAILURE);
4274 	}
4275 
4276 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4277 	    &ud_chan_attr)) != IBT_SUCCESS) {
4278 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4279 		    "failed, ret=%d\n", ret);
4280 		(void) ibt_free_channel(state->id_chnl_hdl);
4281 		return (DDI_FAILURE);
4282 	}
4283 
4284 	state->id_qpnum = ud_chan_attr.ud_qpn;
4285 
4286 	return (DDI_SUCCESS);
4287 }
4288 
4289 static int
4290 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
4291 {
4292 	uint32_t progress = state->id_mac_state;
4293 	uint_t attempts;
4294 	ibt_status_t ret;
4295 	ib_gid_t mgid;
4296 	ibd_mce_t *mce;
4297 	uint8_t jstate;
4298 
4299 	/*
4300 	 * Before we try to stop/undo whatever we did in ibd_start(),
4301 	 * we need to mark the link state appropriately to prevent the
4302 	 * ip layer from using this instance for any new transfers. Note
4303 	 * that if the original state of the link was "up" when we're
4304 	 * here, we'll set the final link state to "unknown", to behave
4305 	 * in the same fashion as other ethernet drivers.
4306 	 */
4307 	mutex_enter(&state->id_link_mutex);
4308 	if (cur_link_state == LINK_STATE_DOWN) {
4309 		state->id_link_state = cur_link_state;
4310 	} else {
4311 		state->id_link_state = LINK_STATE_UNKNOWN;
4312 	}
4313 	mutex_exit(&state->id_link_mutex);
4314 	mac_link_update(state->id_mh, state->id_link_state);
4315 
4316 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4317 	if (progress & IBD_DRV_STARTED) {
4318 		state->id_mac_state &= (~IBD_DRV_STARTED);
4319 	}
4320 
4321 	/*
4322 	 * First, stop receive interrupts; this stops the driver from
4323 	 * handing up buffers to higher layers.  Wait for receive buffers
4324 	 * to be returned and give up after 5 seconds.
4325 	 */
4326 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4327 
4328 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4329 
4330 		attempts = 50;
4331 		while (state->id_rx_list.dl_bufs_outstanding > 0) {
4332 			delay(drv_usectohz(100000));
4333 			if (--attempts == 0) {
4334 				/*
4335 				 * There are pending bufs with the network
4336 				 * layer and we have no choice but to wait
4337 				 * for them to be done with. Reap all the
4338 				 * Tx/Rx completions that were posted since
4339 				 * we turned off the notification and
4340 				 * return failure.
4341 				 */
4342 				DPRINT(2, "ibd_undo_start: "
4343 				    "reclaiming failed");
4344 				ibd_poll_compq(state, state->id_rcq_hdl);
4345 				ibt_set_cq_handler(state->id_rcq_hdl,
4346 				    ibd_rcq_handler, state);
4347 				return (DDI_FAILURE);
4348 			}
4349 		}
4350 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4351 	}
4352 
4353 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4354 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4355 
4356 		mutex_enter(&state->id_trap_lock);
4357 		state->id_trap_stop = B_TRUE;
4358 		while (state->id_trap_inprog > 0)
4359 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4360 		mutex_exit(&state->id_trap_lock);
4361 
4362 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4363 	}
4364 
4365 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4366 		/*
4367 		 * Flushing the channel ensures that all pending WQE's
4368 		 * are marked with flush_error and handed to the CQ. It
4369 		 * does not guarantee the invocation of the CQ handler.
4370 		 * This call is guaranteed to return successfully for
4371 		 * UD QPNs.
4372 		 */
4373 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
4374 		    IBT_SUCCESS) {
4375 			DPRINT(10, "ibd_undo_start: flush_channel "
4376 			    "failed, ret=%d", ret);
4377 		}
4378 
4379 		/*
4380 		 * Turn off Tx interrupts and poll. By the time the polling
4381 		 * returns an empty indicator, we are sure we have seen all
4382 		 * pending Tx callbacks. Note that after the call to
4383 		 * ibt_set_cq_handler() returns, the old handler is
4384 		 * guaranteed not to be invoked anymore.
4385 		 */
4386 		if (ibd_separate_cqs == 1) {
4387 			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4388 		}
4389 		ibd_poll_compq(state, state->id_scq_hdl);
4390 
4391 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4392 	}
4393 
4394 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4395 		/*
4396 		 * No new async requests will be posted since the device
4397 		 * link state has been marked as unknown; completion handlers
4398 		 * have been turned off, so Tx handler will not cause any
4399 		 * more IBD_ASYNC_REAP requests.
4400 		 *
4401 		 * Queue a request for the async thread to exit, which will
4402 		 * be serviced after any pending ones. This can take a while,
4403 		 * specially if the SM is unreachable, since IBMF will slowly
4404 		 * timeout each SM request issued by the async thread.  Reap
4405 		 * the thread before continuing on, we do not want it to be
4406 		 * lingering in modunloaded code (or we could move the reap
4407 		 * to ibd_detach(), provided we keep track of the current
4408 		 * id_async_thrid somewhere safe).
4409 		 */
4410 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4411 		thread_join(state->id_async_thrid);
4412 
4413 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4414 	}
4415 
4416 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4417 		/*
4418 		 * Drop all residual full/non membership. This includes full
4419 		 * membership to the broadcast group, and any nonmembership
4420 		 * acquired during transmits. We do this after the Tx completion
4421 		 * handlers are done, since those might result in some late
4422 		 * leaves; this also eliminates a potential race with that
4423 		 * path wrt the mc full list insert/delete. Trap handling
4424 		 * has also been suppressed at this point. Thus, no locks
4425 		 * are required while traversing the mc full list.
4426 		 */
4427 		DPRINT(2, "ibd_undo_start: clear full cache entries");
4428 		mce = list_head(&state->id_mc_full);
4429 		while (mce != NULL) {
4430 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4431 			jstate = mce->mc_jstate;
4432 			mce = list_next(&state->id_mc_full, mce);
4433 			ibd_leave_group(state, mgid, jstate);
4434 		}
4435 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4436 	}
4437 
4438 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4439 		ibd_fini_rxlist(state);
4440 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4441 	}
4442 
4443 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4444 		ibd_fini_txlist(state);
4445 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4446 	}
4447 
4448 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4449 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
4450 		    IBT_SUCCESS) {
4451 			DPRINT(10, "ibd_undo_start: free_channel "
4452 			    "failed, ret=%d", ret);
4453 		}
4454 
4455 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4456 	}
4457 
4458 	if (progress & IBD_DRV_CQS_ALLOCD) {
4459 		if (ibd_separate_cqs == 1) {
4460 			kmem_free(state->id_txwcs,
4461 			    sizeof (ibt_wc_t) * state->id_txwcs_size);
4462 			if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
4463 			    IBT_SUCCESS) {
4464 				DPRINT(10, "ibd_undo_start: free_cq(scq) "
4465 				    "failed, ret=%d", ret);
4466 			}
4467 		}
4468 
4469 		kmem_free(state->id_rxwcs,
4470 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4471 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
4472 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
4473 			    "ret=%d", ret);
4474 		}
4475 
4476 		state->id_txwcs = NULL;
4477 		state->id_rxwcs = NULL;
4478 		state->id_scq_hdl = NULL;
4479 		state->id_rcq_hdl = NULL;
4480 
4481 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4482 	}
4483 
4484 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4485 		mod_hash_destroy_hash(state->id_ah_active_hash);
4486 		ibd_acache_fini(state);
4487 
4488 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4489 	}
4490 
4491 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4492 		/*
4493 		 * If we'd created the ipoib broadcast group and had
4494 		 * successfully joined it, leave it now
4495 		 */
4496 		if (state->id_bgroup_created) {
4497 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
4498 			jstate = IB_MC_JSTATE_FULL;
4499 			(void) ibt_leave_mcg(state->id_sgid, mgid,
4500 			    state->id_sgid, jstate);
4501 		}
4502 		ibt_free_mcg_info(state->id_mcinfo, 1);
4503 
4504 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4505 	}
4506 
4507 	return (DDI_SUCCESS);
4508 }
4509 
4510 /*
4511  * These pair of routines are used to set/clear the condition that
4512  * the caller is likely to do something to change the id_mac_state.
4513  * If there's already someone doing either a start or a stop (possibly
4514  * due to the async handler detecting a pkey relocation event, a plumb
4515  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4516  * that's done.
4517  */
4518 static void
4519 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4520 {
4521 	mutex_enter(&state->id_macst_lock);
4522 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4523 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4524 
4525 	state->id_mac_state |= flag;
4526 	mutex_exit(&state->id_macst_lock);
4527 }
4528 
4529 static void
4530 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4531 {
4532 	mutex_enter(&state->id_macst_lock);
4533 	state->id_mac_state &= (~flag);
4534 	cv_signal(&state->id_macst_cv);
4535 	mutex_exit(&state->id_macst_lock);
4536 }
4537 
4538 /*
4539  * GLDv3 entry point to start hardware.
4540  */
4541 /*ARGSUSED*/
4542 static int
4543 ibd_m_start(void *arg)
4544 {
4545 	ibd_state_t *state = arg;
4546 	int	ret;
4547 
4548 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4549 
4550 	ret = ibd_start(state);
4551 
4552 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4553 
4554 	return (ret);
4555 }
4556 
4557 static int
4558 ibd_start(ibd_state_t *state)
4559 {
4560 	kthread_t *kht;
4561 	int err;
4562 	ibt_status_t ret;
4563 
4564 	if (state->id_mac_state & IBD_DRV_STARTED)
4565 		return (DDI_SUCCESS);
4566 
4567 	/*
4568 	 * Get port details; if we fail here, very likely the port
4569 	 * state is inactive or the pkey can't be verified.
4570 	 */
4571 	if ((err = ibd_get_port_details(state)) != 0) {
4572 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4573 		goto start_fail;
4574 	}
4575 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4576 
4577 	/*
4578 	 * Find the IPoIB broadcast group
4579 	 */
4580 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4581 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4582 		err = ENOTACTIVE;
4583 		goto start_fail;
4584 	}
4585 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4586 
4587 	/*
4588 	 * Initialize per-interface caches and lists; if we fail here,
4589 	 * it is most likely due to a lack of resources
4590 	 */
4591 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4592 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4593 		err = ENOMEM;
4594 		goto start_fail;
4595 	}
4596 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4597 
4598 	/*
4599 	 * Allocate send and receive completion queues
4600 	 */
4601 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4602 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4603 		err = ENOMEM;
4604 		goto start_fail;
4605 	}
4606 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4607 
4608 	/*
4609 	 * Setup a UD channel
4610 	 */
4611 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4612 		err = ENOMEM;
4613 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4614 		goto start_fail;
4615 	}
4616 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4617 
4618 	/*
4619 	 * Allocate and initialize the tx buffer list
4620 	 */
4621 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4622 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4623 		err = ENOMEM;
4624 		goto start_fail;
4625 	}
4626 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4627 
4628 	/*
4629 	 * If we have separate cqs, create the send cq handler here
4630 	 */
4631 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
4632 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4633 		if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4634 		    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4635 			DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4636 			    "failed, ret=%d", ret);
4637 			err = EINVAL;
4638 			goto start_fail;
4639 		}
4640 		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4641 	}
4642 
4643 	/*
4644 	 * Allocate and initialize the rx buffer list
4645 	 */
4646 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4647 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4648 		err = ENOMEM;
4649 		goto start_fail;
4650 	}
4651 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4652 
4653 	/*
4654 	 * Join IPoIB broadcast group
4655 	 */
4656 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4657 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4658 		err = ENOTACTIVE;
4659 		goto start_fail;
4660 	}
4661 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4662 
4663 	/*
4664 	 * Create the async thread; thread_create never fails.
4665 	 */
4666 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4667 	    TS_RUN, minclsyspri);
4668 	state->id_async_thrid = kht->t_did;
4669 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4670 
4671 	/*
4672 	 * When we did mac_register() in ibd_attach(), we didn't register
4673 	 * the real macaddr and we didn't have the true port mtu. Now that
4674 	 * we're almost ready, set the local mac address and broadcast
4675 	 * addresses and update gldv3 about the real values of these
4676 	 * parameters.
4677 	 */
4678 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4679 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4680 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4681 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4682 
4683 	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
4684 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4685 
4686 	/*
4687 	 * Setup the receive cq handler
4688 	 */
4689 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4690 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4691 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4692 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4693 		    "failed, ret=%d", ret);
4694 		err = EINVAL;
4695 		goto start_fail;
4696 	}
4697 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4698 
4699 	/*
4700 	 * Setup the subnet notices handler after we've initialized the acache/
4701 	 * mcache and started the async thread, both of which are required for
4702 	 * the trap handler to function properly.
4703 	 *
4704 	 * Now that the async thread has been started (and we've already done
4705 	 * a mac_register() during attach so mac_tx_update() can be called
4706 	 * if necessary without any problem), we can enable the trap handler
4707 	 * to queue requests to the async thread.
4708 	 */
4709 	ibt_register_subnet_notices(state->id_ibt_hdl,
4710 	    ibd_snet_notices_handler, state);
4711 	mutex_enter(&state->id_trap_lock);
4712 	state->id_trap_stop = B_FALSE;
4713 	mutex_exit(&state->id_trap_lock);
4714 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4715 
4716 	/*
4717 	 * Indicate link status to GLDv3 and higher layers. By default,
4718 	 * we assume we are in up state (which must have been true at
4719 	 * least at the time the broadcast mcg's were probed); if there
4720 	 * were any up/down transitions till the time we come here, the
4721 	 * async handler will have updated last known state, which we
4722 	 * use to tell GLDv3. The async handler will not send any
4723 	 * notifications to GLDv3 till we reach here in the initialization
4724 	 * sequence.
4725 	 */
4726 	state->id_mac_state |= IBD_DRV_STARTED;
4727 	mac_link_update(state->id_mh, state->id_link_state);
4728 
4729 	return (DDI_SUCCESS);
4730 
4731 start_fail:
4732 	/*
4733 	 * If we ran into a problem during ibd_start() and ran into
4734 	 * some other problem during undoing our partial work, we can't
4735 	 * do anything about it.  Ignore any errors we might get from
4736 	 * ibd_undo_start() and just return the original error we got.
4737 	 */
4738 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
4739 	return (err);
4740 }
4741 
4742 /*
4743  * GLDv3 entry point to stop hardware from receiving packets.
4744  */
4745 /*ARGSUSED*/
4746 static void
4747 ibd_m_stop(void *arg)
4748 {
4749 	ibd_state_t *state = (ibd_state_t *)arg;
4750 
4751 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4752 
4753 	(void) ibd_undo_start(state, state->id_link_state);
4754 
4755 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4756 }
4757 
4758 /*
4759  * GLDv3 entry point to modify device's mac address. We do not
4760  * allow address modifications.
4761  */
4762 static int
4763 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4764 {
4765 	ibd_state_t *state = arg;
4766 
4767 	/*
4768 	 * Don't bother even comparing the macaddr if we haven't
4769 	 * completed ibd_m_start().
4770 	 */
4771 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4772 		return (0);
4773 
4774 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4775 		return (0);
4776 	else
4777 		return (EINVAL);
4778 }
4779 
4780 /*
4781  * The blocking part of the IBA join/leave operations are done out
4782  * of here on the async thread.
4783  */
4784 static void
4785 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4786 {
4787 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4788 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4789 
4790 	if (op == IBD_ASYNC_JOIN) {
4791 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4792 			ibd_print_warn(state, "Joint multicast group failed :"
4793 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4794 		}
4795 	} else {
4796 		/*
4797 		 * Here, we must search for the proper mcg_info and
4798 		 * use that to leave the group.
4799 		 */
4800 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4801 	}
4802 }
4803 
4804 /*
4805  * GLDv3 entry point for multicast enable/disable requests.
4806  * This function queues the operation to the async thread and
4807  * return success for a valid multicast address.
4808  */
4809 static int
4810 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4811 {
4812 	ibd_state_t *state = (ibd_state_t *)arg;
4813 	ipoib_mac_t maddr, *mcast;
4814 	ib_gid_t mgid;
4815 	ibd_req_t *req;
4816 
4817 	/*
4818 	 * If we haven't completed ibd_m_start(), async thread wouldn't
4819 	 * have been started and id_bcaddr wouldn't be set, so there's
4820 	 * no point in continuing.
4821 	 */
4822 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4823 		return (0);
4824 
4825 	/*
4826 	 * The incoming multicast address might not be aligned properly
4827 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4828 	 * it to look like one though, to get the offsets of the mc gid,
4829 	 * since we know we are not going to dereference any values with
4830 	 * the ipoib_mac_t pointer.
4831 	 */
4832 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4833 	mcast = &maddr;
4834 
4835 	/*
4836 	 * Check validity of MCG address. We could additionally check
4837 	 * that a enable/disable is not being issued on the "broadcast"
4838 	 * mcg, but since this operation is only invokable by priviledged
4839 	 * programs anyway, we allow the flexibility to those dlpi apps.
4840 	 * Note that we do not validate the "scope" of the IBA mcg.
4841 	 */
4842 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4843 		return (EINVAL);
4844 
4845 	/*
4846 	 * fill in multicast pkey and scope
4847 	 */
4848 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4849 
4850 	/*
4851 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4852 	 * nothing (i.e. we stay JOINed to the broadcast group done in
4853 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
4854 	 * requires to be joined to broadcast groups at all times.
4855 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4856 	 * depends on this.
4857 	 */
4858 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4859 		return (0);
4860 
4861 	ibd_n2h_gid(mcast, &mgid);
4862 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4863 	if (req == NULL)
4864 		return (ENOMEM);
4865 
4866 	req->rq_gid = mgid;
4867 
4868 	if (add) {
4869 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4870 		    mgid.gid_prefix, mgid.gid_guid);
4871 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
4872 	} else {
4873 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4874 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4875 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
4876 	}
4877 	return (0);
4878 }
4879 
4880 /*
4881  * The blocking part of the IBA promiscuous operations are done
4882  * out of here on the async thread. The dlpireq parameter indicates
4883  * whether this invocation is due to a dlpi request or due to
4884  * a port up/down event.
4885  */
4886 static void
4887 ibd_async_unsetprom(ibd_state_t *state)
4888 {
4889 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4890 	ib_gid_t mgid;
4891 
4892 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4893 
4894 	while (mce != NULL) {
4895 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4896 		mce = list_next(&state->id_mc_non, mce);
4897 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4898 	}
4899 	state->id_prom_op = IBD_OP_NOTSTARTED;
4900 }
4901 
4902 /*
4903  * The blocking part of the IBA promiscuous operations are done
4904  * out of here on the async thread. The dlpireq parameter indicates
4905  * whether this invocation is due to a dlpi request or due to
4906  * a port up/down event.
4907  */
4908 static void
4909 ibd_async_setprom(ibd_state_t *state)
4910 {
4911 	ibt_mcg_attr_t mcg_attr;
4912 	ibt_mcg_info_t *mcg_info;
4913 	ib_gid_t mgid;
4914 	uint_t numg;
4915 	int i;
4916 	char ret = IBD_OP_COMPLETED;
4917 
4918 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4919 
4920 	/*
4921 	 * Obtain all active MC groups on the IB fabric with
4922 	 * specified criteria (scope + Pkey + Qkey + mtu).
4923 	 */
4924 	bzero(&mcg_attr, sizeof (mcg_attr));
4925 	mcg_attr.mc_pkey = state->id_pkey;
4926 	mcg_attr.mc_scope = state->id_scope;
4927 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4928 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4929 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4930 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4931 	    IBT_SUCCESS) {
4932 		ibd_print_warn(state, "Could not get list of IBA multicast "
4933 		    "groups");
4934 		ret = IBD_OP_ERRORED;
4935 		goto done;
4936 	}
4937 
4938 	/*
4939 	 * Iterate over the returned mcg's and join as NonMember
4940 	 * to the IP mcg's.
4941 	 */
4942 	for (i = 0; i < numg; i++) {
4943 		/*
4944 		 * Do a NonMember JOIN on the MC group.
4945 		 */
4946 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4947 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4948 			ibd_print_warn(state, "IBA promiscuous mode missed "
4949 			    "multicast gid %016llx:%016llx",
4950 			    (u_longlong_t)mgid.gid_prefix,
4951 			    (u_longlong_t)mgid.gid_guid);
4952 	}
4953 
4954 	ibt_free_mcg_info(mcg_info, numg);
4955 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4956 done:
4957 	state->id_prom_op = ret;
4958 }
4959 
4960 /*
4961  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4962  * GLDv3 assumes phys state receives more packets than multi state,
4963  * which is not true for IPoIB. Thus, treat the multi and phys
4964  * promiscuous states the same way to work with GLDv3's assumption.
4965  */
4966 static int
4967 ibd_m_promisc(void *arg, boolean_t on)
4968 {
4969 	ibd_state_t *state = (ibd_state_t *)arg;
4970 	ibd_req_t *req;
4971 
4972 	/*
4973 	 * Async thread wouldn't have been started if we haven't
4974 	 * passed ibd_m_start()
4975 	 */
4976 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4977 		return (0);
4978 
4979 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4980 	if (req == NULL)
4981 		return (ENOMEM);
4982 	if (on) {
4983 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4984 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
4985 	} else {
4986 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4987 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
4988 	}
4989 
4990 	return (0);
4991 }
4992 
4993 /*
4994  * GLDv3 entry point for gathering statistics.
4995  */
4996 static int
4997 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
4998 {
4999 	ibd_state_t *state = (ibd_state_t *)arg;
5000 
5001 	switch (stat) {
5002 	case MAC_STAT_IFSPEED:
5003 		*val = state->id_link_speed;
5004 		break;
5005 	case MAC_STAT_MULTIRCV:
5006 		*val = state->id_multi_rcv;
5007 		break;
5008 	case MAC_STAT_BRDCSTRCV:
5009 		*val = state->id_brd_rcv;
5010 		break;
5011 	case MAC_STAT_MULTIXMT:
5012 		*val = state->id_multi_xmt;
5013 		break;
5014 	case MAC_STAT_BRDCSTXMT:
5015 		*val = state->id_brd_xmt;
5016 		break;
5017 	case MAC_STAT_RBYTES:
5018 		*val = state->id_rcv_bytes;
5019 		break;
5020 	case MAC_STAT_IPACKETS:
5021 		*val = state->id_rcv_pkt;
5022 		break;
5023 	case MAC_STAT_OBYTES:
5024 		*val = state->id_xmt_bytes;
5025 		break;
5026 	case MAC_STAT_OPACKETS:
5027 		*val = state->id_xmt_pkt;
5028 		break;
5029 	case MAC_STAT_OERRORS:
5030 		*val = state->id_ah_error;	/* failed AH translation */
5031 		break;
5032 	case MAC_STAT_IERRORS:
5033 		*val = 0;
5034 		break;
5035 	case MAC_STAT_NOXMTBUF:
5036 		*val = state->id_tx_short;
5037 		break;
5038 	case MAC_STAT_NORCVBUF:
5039 	default:
5040 		return (ENOTSUP);
5041 	}
5042 
5043 	return (0);
5044 }
5045 
5046 static void
5047 ibd_async_txsched(ibd_state_t *state)
5048 {
5049 	ibd_req_t *req;
5050 	int ret;
5051 
5052 	if (ibd_txcomp_poll)
5053 		ibd_poll_compq(state, state->id_scq_hdl);
5054 
5055 	ret = ibd_resume_transmission(state);
5056 	if (ret && ibd_txcomp_poll) {
5057 		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
5058 			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5059 		else {
5060 			ibd_print_warn(state, "ibd_async_txsched: "
5061 			    "no memory, can't schedule work slot");
5062 		}
5063 	}
5064 }
5065 
5066 static int
5067 ibd_resume_transmission(ibd_state_t *state)
5068 {
5069 	int flag;
5070 	int met_thresh = 0;
5071 	int ret = -1;
5072 
5073 	mutex_enter(&state->id_sched_lock);
5074 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5075 		met_thresh = (state->id_tx_list.dl_cnt >
5076 		    IBD_FREE_SWQES_THRESH);
5077 		flag = IBD_RSRC_SWQE;
5078 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5079 		ASSERT(state->id_lso != NULL);
5080 		met_thresh = (state->id_lso->bkt_nfree >
5081 		    IBD_FREE_LSOS_THRESH);
5082 		flag = IBD_RSRC_LSOBUF;
5083 	}
5084 	if (met_thresh) {
5085 		state->id_sched_needed &= ~flag;
5086 		ret = 0;
5087 	}
5088 	mutex_exit(&state->id_sched_lock);
5089 
5090 	if (ret == 0)
5091 		mac_tx_update(state->id_mh);
5092 
5093 	return (ret);
5094 }
5095 
5096 /*
5097  * Release the send wqe back into free list.
5098  */
5099 static void
5100 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
5101 {
5102 	/*
5103 	 * Add back on Tx list for reuse.
5104 	 */
5105 	swqe->swqe_next = NULL;
5106 	mutex_enter(&state->id_tx_list.dl_mutex);
5107 	if (state->id_tx_list.dl_pending_sends) {
5108 		state->id_tx_list.dl_pending_sends = B_FALSE;
5109 	}
5110 	if (state->id_tx_list.dl_head == NULL) {
5111 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
5112 	} else {
5113 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
5114 	}
5115 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
5116 	state->id_tx_list.dl_cnt++;
5117 	mutex_exit(&state->id_tx_list.dl_mutex);
5118 }
5119 
5120 /*
5121  * Acquire a send wqe from free list.
5122  * Returns error number and send wqe pointer.
5123  */
5124 static int
5125 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
5126 {
5127 	int rc = 0;
5128 	ibd_swqe_t *wqe;
5129 
5130 	/*
5131 	 * Check and reclaim some of the completed Tx requests.
5132 	 * If someone else is already in this code and pulling Tx
5133 	 * completions, no need to poll, since the current lock holder
5134 	 * will do the work anyway. Normally, we poll for completions
5135 	 * every few Tx attempts, but if we are short on Tx descriptors,
5136 	 * we always try to poll.
5137 	 */
5138 	if ((ibd_txcomp_poll == 1) &&
5139 	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
5140 		ibd_poll_compq(state, state->id_scq_hdl);
5141 	}
5142 
5143 	/*
5144 	 * Grab required transmit wqes.
5145 	 */
5146 	mutex_enter(&state->id_tx_list.dl_mutex);
5147 	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5148 	if (wqe != NULL) {
5149 		state->id_tx_list.dl_cnt -= 1;
5150 		state->id_tx_list.dl_head = wqe->swqe_next;
5151 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
5152 			state->id_tx_list.dl_tail = NULL;
5153 	} else {
5154 		/*
5155 		 * If we did not find the number we were looking for, flag
5156 		 * no resource. Adjust list appropriately in either case.
5157 		 */
5158 		rc = ENOENT;
5159 		state->id_tx_list.dl_pending_sends = B_TRUE;
5160 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5161 		atomic_add_64(&state->id_tx_short, 1);
5162 	}
5163 	mutex_exit(&state->id_tx_list.dl_mutex);
5164 	*swqe = wqe;
5165 
5166 	return (rc);
5167 }
5168 
5169 static int
5170 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
5171     ibt_ud_dest_hdl_t ud_dest)
5172 {
5173 	mblk_t	*nmp;
5174 	int iph_len, tcph_len;
5175 	ibt_wr_lso_t *lso;
5176 	uintptr_t ip_start, tcp_start;
5177 	uint8_t *dst;
5178 	uint_t pending, mblen;
5179 
5180 	/*
5181 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
5182 	 * we need to adjust it here for lso.
5183 	 */
5184 	lso = &(node->w_swr.wr.ud_lso);
5185 	lso->lso_ud_dest = ud_dest;
5186 	lso->lso_mss = mss;
5187 
5188 	/*
5189 	 * Calculate the LSO header size and set it in the UD LSO structure.
5190 	 * Note that the only assumption we make is that each of the IPoIB,
5191 	 * IP and TCP headers will be contained in a single mblk fragment;
5192 	 * together, the headers may span multiple mblk fragments.
5193 	 */
5194 	nmp = mp;
5195 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
5196 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
5197 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
5198 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
5199 		nmp = nmp->b_cont;
5200 
5201 	}
5202 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
5203 
5204 	tcp_start = ip_start + iph_len;
5205 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
5206 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
5207 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
5208 		nmp = nmp->b_cont;
5209 	}
5210 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5211 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5212 
5213 	/*
5214 	 * If the lso header fits entirely within a single mblk fragment,
5215 	 * we'll avoid an additional copy of the lso header here and just
5216 	 * pass the b_rptr of the mblk directly.
5217 	 *
5218 	 * If this isn't true, we'd have to allocate for it explicitly.
5219 	 */
5220 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5221 		lso->lso_hdr = mp->b_rptr;
5222 	} else {
5223 		/* On work completion, remember to free this allocated hdr */
5224 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5225 		if (lso->lso_hdr == NULL) {
5226 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5227 			    "sz = %d", lso->lso_hdr_sz);
5228 			lso->lso_hdr_sz = 0;
5229 			lso->lso_mss = 0;
5230 			return (-1);
5231 		}
5232 	}
5233 
5234 	/*
5235 	 * Copy in the lso header only if we need to
5236 	 */
5237 	if (lso->lso_hdr != mp->b_rptr) {
5238 		dst = lso->lso_hdr;
5239 		pending = lso->lso_hdr_sz;
5240 
5241 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5242 			mblen = MBLKL(nmp);
5243 			if (pending > mblen) {
5244 				bcopy(nmp->b_rptr, dst, mblen);
5245 				dst += mblen;
5246 				pending -= mblen;
5247 			} else {
5248 				bcopy(nmp->b_rptr, dst, pending);
5249 				break;
5250 			}
5251 		}
5252 	}
5253 
5254 	return (0);
5255 }
5256 
5257 static void
5258 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5259 {
5260 	ibt_wr_lso_t *lso;
5261 
5262 	if ((!node) || (!mp))
5263 		return;
5264 
5265 	/*
5266 	 * Free any header space that we might've allocated if we
5267 	 * did an LSO
5268 	 */
5269 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5270 		lso = &(node->w_swr.wr.ud_lso);
5271 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5272 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5273 			lso->lso_hdr = NULL;
5274 			lso->lso_hdr_sz = 0;
5275 		}
5276 	}
5277 }
5278 
5279 static void
5280 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5281 {
5282 	uint_t		i;
5283 	uint_t		num_posted;
5284 	uint_t		n_wrs;
5285 	ibt_status_t	ibt_status;
5286 	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
5287 	ibd_swqe_t	*elem;
5288 	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];
5289 
5290 	node->swqe_next = NULL;
5291 
5292 	mutex_enter(&state->id_txpost_lock);
5293 
5294 	/*
5295 	 * Enqueue the new node in chain of wqes to send
5296 	 */
5297 	if (state->id_tx_head) {
5298 		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
5299 	} else {
5300 		state->id_tx_head = node;
5301 	}
5302 	state->id_tx_tailp = &(node->swqe_next);
5303 
5304 	/*
5305 	 * If someone else is helping out with the sends,
5306 	 * just go back
5307 	 */
5308 	if (state->id_tx_busy) {
5309 		mutex_exit(&state->id_txpost_lock);
5310 		return;
5311 	}
5312 
5313 	/*
5314 	 * Otherwise, mark the flag to indicate that we'll be
5315 	 * doing the dispatch of what's there in the wqe chain
5316 	 */
5317 	state->id_tx_busy = 1;
5318 
5319 	while (state->id_tx_head) {
5320 		/*
5321 		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
5322 		 * at a time if possible, and keep posting them.
5323 		 */
5324 		for (n_wrs = 0, elem = state->id_tx_head;
5325 		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
5326 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5327 
5328 			nodes[n_wrs] = elem;
5329 			wrs[n_wrs] = elem->w_swr;
5330 		}
5331 		state->id_tx_head = elem;
5332 
5333 		/*
5334 		 * Release the txpost lock before posting the
5335 		 * send request to the hca; if the posting fails
5336 		 * for some reason, we'll never receive completion
5337 		 * intimation, so we'll need to cleanup.
5338 		 */
5339 		mutex_exit(&state->id_txpost_lock);
5340 
5341 		ASSERT(n_wrs != 0);
5342 
5343 		/*
5344 		 * If posting fails for some reason, we'll never receive
5345 		 * completion intimation, so we'll need to cleanup. But
5346 		 * we need to make sure we don't clean up nodes whose
5347 		 * wrs have been successfully posted. We assume that the
5348 		 * hca driver returns on the first failure to post and
5349 		 * therefore the first 'num_posted' entries don't need
5350 		 * cleanup here.
5351 		 */
5352 		num_posted = 0;
5353 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5354 		    wrs, n_wrs, &num_posted);
5355 		if (ibt_status != IBT_SUCCESS) {
5356 
5357 			ibd_print_warn(state, "ibd_post_send: "
5358 			    "posting multiple wrs failed: "
5359 			    "requested=%d, done=%d, ret=%d",
5360 			    n_wrs, num_posted, ibt_status);
5361 
5362 			for (i = num_posted; i < n_wrs; i++)
5363 				ibd_tx_cleanup(state, nodes[i]);
5364 		}
5365 
5366 		/*
5367 		 * Grab the mutex before we go and check the tx Q again
5368 		 */
5369 		mutex_enter(&state->id_txpost_lock);
5370 	}
5371 
5372 	state->id_tx_busy = 0;
5373 	mutex_exit(&state->id_txpost_lock);
5374 }
5375 
5376 static int
5377 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5378     uint_t lsohdr_sz)
5379 {
5380 	ibt_wr_ds_t *sgl;
5381 	ibt_status_t ibt_status;
5382 	mblk_t *nmp;
5383 	mblk_t *data_mp;
5384 	uchar_t *bufp;
5385 	size_t blksize;
5386 	size_t skip;
5387 	size_t avail;
5388 	uint_t pktsize;
5389 	uint_t frag_len;
5390 	uint_t pending_hdr;
5391 	uint_t hiwm;
5392 	int nmblks;
5393 	int i;
5394 
5395 	/*
5396 	 * Let's skip ahead to the data if this is LSO
5397 	 */
5398 	data_mp = mp;
5399 	pending_hdr = 0;
5400 	if (lsohdr_sz) {
5401 		pending_hdr = lsohdr_sz;
5402 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5403 			frag_len = nmp->b_wptr - nmp->b_rptr;
5404 			if (frag_len > pending_hdr)
5405 				break;
5406 			pending_hdr -= frag_len;
5407 		}
5408 		data_mp = nmp;	/* start of data past lso header */
5409 		ASSERT(data_mp != NULL);
5410 	}
5411 
5412 	/*
5413 	 * Calculate the size of message data and number of msg blocks
5414 	 */
5415 	pktsize = 0;
5416 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5417 	    nmp = nmp->b_cont, nmblks++) {
5418 		pktsize += MBLKL(nmp);
5419 	}
5420 	pktsize -= pending_hdr;
5421 
5422 	/*
5423 	 * Translating the virtual address regions into physical regions
5424 	 * for using the Reserved LKey feature results in a wr sgl that
5425 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
5426 	 * we'll fix a high-water mark (65%) for when we should stop.
5427 	 */
5428 	hiwm = (state->id_max_sqseg * 65) / 100;
5429 
5430 	/*
5431 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5432 	 * "copy-threshold", and if the number of mp fragments is less than
5433 	 * the maximum acceptable.
5434 	 */
5435 	if ((state->id_hca_res_lkey_capab) &&
5436 	    (pktsize > IBD_TX_COPY_THRESH) &&
5437 	    (nmblks < hiwm)) {
5438 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5439 		ibt_iov_attr_t iov_attr;
5440 
5441 		iov_attr.iov_as = NULL;
5442 		iov_attr.iov = iov_arr;
5443 		iov_attr.iov_buf = NULL;
5444 		iov_attr.iov_list_len = nmblks;
5445 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5446 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5447 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5448 
5449 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5450 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5451 			iov_arr[i].iov_len = MBLKL(nmp);
5452 			if (i == 0) {
5453 				iov_arr[i].iov_addr += pending_hdr;
5454 				iov_arr[i].iov_len -= pending_hdr;
5455 			}
5456 		}
5457 
5458 		node->w_buftype = IBD_WQE_MAPPED;
5459 		node->w_swr.wr_sgl = node->w_sgl;
5460 
5461 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5462 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5463 		if (ibt_status != IBT_SUCCESS) {
5464 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5465 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5466 			goto ibd_copy_path;
5467 		}
5468 
5469 		return (0);
5470 	}
5471 
5472 ibd_copy_path:
5473 	if (pktsize <= state->id_tx_buf_sz) {
5474 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5475 		node->w_swr.wr_nds = 1;
5476 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5477 		node->w_buftype = IBD_WQE_TXBUF;
5478 
5479 		/*
5480 		 * Even though this is the copy path for transfers less than
5481 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5482 		 * is possible the first data mblk fragment (data_mp) still
5483 		 * contains part of the LSO header that we need to skip.
5484 		 */
5485 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5486 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5487 			blksize = MBLKL(nmp) - pending_hdr;
5488 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5489 			bufp += blksize;
5490 			pending_hdr = 0;
5491 		}
5492 
5493 		return (0);
5494 	}
5495 
5496 	/*
5497 	 * Copy path for transfers greater than id_tx_buf_sz
5498 	 */
5499 	node->w_swr.wr_sgl = node->w_sgl;
5500 	if (ibd_acquire_lsobufs(state, pktsize,
5501 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5502 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5503 		return (-1);
5504 	}
5505 	node->w_buftype = IBD_WQE_LSOBUF;
5506 
5507 	/*
5508 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5509 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5510 	 * need to skip part of the LSO header in the first fragment
5511 	 * as before.
5512 	 */
5513 	nmp = data_mp;
5514 	skip = pending_hdr;
5515 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5516 		sgl = node->w_swr.wr_sgl + i;
5517 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5518 		avail = IBD_LSO_BUFSZ;
5519 		while (nmp && avail) {
5520 			blksize = MBLKL(nmp) - skip;
5521 			if (blksize > avail) {
5522 				bcopy(nmp->b_rptr + skip, bufp, avail);
5523 				skip += avail;
5524 				avail = 0;
5525 			} else {
5526 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5527 				skip = 0;
5528 				avail -= blksize;
5529 				bufp += blksize;
5530 				nmp = nmp->b_cont;
5531 			}
5532 		}
5533 	}
5534 
5535 	return (0);
5536 }
5537 
5538 /*
5539  * Schedule a completion queue polling to reap the resource we're
5540  * short on.  If we implement the change to reap tx completions
5541  * in a separate thread, we'll need to wake up that thread here.
5542  */
5543 static int
5544 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5545 {
5546 	ibd_req_t *req;
5547 
5548 	mutex_enter(&state->id_sched_lock);
5549 	state->id_sched_needed |= resource_type;
5550 	mutex_exit(&state->id_sched_lock);
5551 
5552 	/*
5553 	 * If we are asked to queue a work entry, we need to do it
5554 	 */
5555 	if (q_flag) {
5556 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5557 		if (req == NULL)
5558 			return (-1);
5559 
5560 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5561 	}
5562 
5563 	return (0);
5564 }
5565 
5566 /*
5567  * The passed in packet has this format:
5568  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5569  */
5570 static boolean_t
5571 ibd_send(ibd_state_t *state, mblk_t *mp)
5572 {
5573 	ibd_ace_t *ace;
5574 	ibd_swqe_t *node;
5575 	ipoib_mac_t *dest;
5576 	ib_header_info_t *ipibp;
5577 	ip6_t *ip6h;
5578 	uint_t pktsize;
5579 	uint32_t mss;
5580 	uint32_t hckflags;
5581 	uint32_t lsoflags = 0;
5582 	uint_t lsohdr_sz = 0;
5583 	int ret, len;
5584 	boolean_t dofree = B_FALSE;
5585 	boolean_t rc;
5586 
5587 	/*
5588 	 * If we aren't done with the device initialization and start,
5589 	 * we shouldn't be here.
5590 	 */
5591 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5592 		return (B_FALSE);
5593 
5594 	node = NULL;
5595 	if (ibd_acquire_swqe(state, &node) != 0) {
5596 		/*
5597 		 * If we don't have an swqe available, schedule a transmit
5598 		 * completion queue cleanup and hold off on sending more
5599 		 * more packets until we have some free swqes
5600 		 */
5601 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5602 			return (B_FALSE);
5603 
5604 		/*
5605 		 * If a poll cannot be scheduled, we have no choice but
5606 		 * to drop this packet
5607 		 */
5608 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5609 		return (B_TRUE);
5610 	}
5611 
5612 	/*
5613 	 * Initialize the commonly used fields in swqe to NULL to protect
5614 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5615 	 * failure.
5616 	 */
5617 	node->swqe_im_mblk = NULL;
5618 	node->w_swr.wr_nds = 0;
5619 	node->w_swr.wr_sgl = NULL;
5620 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5621 
5622 	/*
5623 	 * Obtain an address handle for the destination.
5624 	 */
5625 	ipibp = (ib_header_info_t *)mp->b_rptr;
5626 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5627 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5628 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5629 
5630 	pktsize = msgsize(mp);
5631 
5632 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5633 	atomic_inc_64(&state->id_xmt_pkt);
5634 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5635 		atomic_inc_64(&state->id_brd_xmt);
5636 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5637 		atomic_inc_64(&state->id_multi_xmt);
5638 
5639 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5640 		node->w_ahandle = ace;
5641 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5642 	} else {
5643 		DPRINT(5,
5644 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5645 		    ((ret == EFAULT) ? "failed" : "queued"),
5646 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5647 		    htonl(dest->ipoib_gidpref[1]),
5648 		    htonl(dest->ipoib_gidsuff[0]),
5649 		    htonl(dest->ipoib_gidsuff[1]));
5650 		node->w_ahandle = NULL;
5651 
5652 		/*
5653 		 * for the poll mode, it is probably some cqe pending in the
5654 		 * cq. So ibd has to poll cq here, otherwise acache probably
5655 		 * may not be recycled.
5656 		 */
5657 		if (ibd_txcomp_poll == 1)
5658 			ibd_poll_compq(state, state->id_scq_hdl);
5659 
5660 		/*
5661 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5662 		 * can not find a path for the specific dest address. We
5663 		 * should get rid of this kind of packet.  We also should get
5664 		 * rid of the packet if we cannot schedule a poll via the
5665 		 * async thread.  For the normal case, ibd will return the
5666 		 * packet to upper layer and wait for AH creating.
5667 		 *
5668 		 * Note that we always queue a work slot entry for the async
5669 		 * thread when we fail AH lookup (even in intr mode); this is
5670 		 * due to the convoluted way the code currently looks for AH.
5671 		 */
5672 		if (ret == EFAULT) {
5673 			dofree = B_TRUE;
5674 			rc = B_TRUE;
5675 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5676 			dofree = B_TRUE;
5677 			rc = B_TRUE;
5678 		} else {
5679 			dofree = B_FALSE;
5680 			rc = B_FALSE;
5681 		}
5682 		goto ibd_send_fail;
5683 	}
5684 
5685 	/*
5686 	 * For ND6 packets, padding is at the front of the source lladdr.
5687 	 * Insert the padding at front.
5688 	 */
5689 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
5690 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5691 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5692 			    sizeof (ib_header_info_t))) {
5693 				DPRINT(10, "ibd_send: pullupmsg failure ");
5694 				dofree = B_TRUE;
5695 				rc = B_TRUE;
5696 				goto ibd_send_fail;
5697 			}
5698 			ipibp = (ib_header_info_t *)mp->b_rptr;
5699 		}
5700 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5701 		    sizeof (ib_header_info_t));
5702 		len = ntohs(ip6h->ip6_plen);
5703 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5704 			mblk_t	*pad;
5705 
5706 			pad = allocb(4, 0);
5707 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5708 			linkb(mp, pad);
5709 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5710 			    IPV6_HDR_LEN + len + 4) {
5711 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5712 				    IPV6_HDR_LEN + len + 4)) {
5713 					DPRINT(10, "ibd_send: pullupmsg "
5714 					    "failure ");
5715 					dofree = B_TRUE;
5716 					rc = B_TRUE;
5717 					goto ibd_send_fail;
5718 				}
5719 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5720 				    sizeof (ib_header_info_t));
5721 			}
5722 
5723 			/* LINTED: E_CONSTANT_CONDITION */
5724 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5725 		}
5726 	}
5727 
5728 	mp->b_rptr += sizeof (ib_addrs_t);
5729 
5730 	/*
5731 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5732 	 * ud destination, the opcode and the LSO header information to the
5733 	 * work request.
5734 	 */
5735 	lso_info_get(mp, &mss, &lsoflags);
5736 	if ((lsoflags & HW_LSO) != HW_LSO) {
5737 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5738 		lsohdr_sz = 0;
5739 	} else {
5740 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5741 			/*
5742 			 * The routine can only fail if there's no memory; we
5743 			 * can only drop the packet if this happens
5744 			 */
5745 			ibd_print_warn(state,
5746 			    "ibd_send: no memory, lso posting failed");
5747 			dofree = B_TRUE;
5748 			rc = B_TRUE;
5749 			goto ibd_send_fail;
5750 		}
5751 
5752 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5753 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5754 	}
5755 
5756 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5757 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5758 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5759 	else
5760 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5761 
5762 	/*
5763 	 * Prepare the sgl for posting; the routine can only fail if there's
5764 	 * no lso buf available for posting. If this is the case, we should
5765 	 * probably resched for lso bufs to become available and then try again.
5766 	 */
5767 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5768 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5769 			dofree = B_TRUE;
5770 			rc = B_TRUE;
5771 		} else {
5772 			dofree = B_FALSE;
5773 			rc = B_FALSE;
5774 		}
5775 		goto ibd_send_fail;
5776 	}
5777 	node->swqe_im_mblk = mp;
5778 
5779 	/*
5780 	 * Queue the wqe to hardware; since we can now simply queue a
5781 	 * post instead of doing it serially, we cannot assume anything
5782 	 * about the 'node' after ibd_post_send() returns.
5783 	 */
5784 	ibd_post_send(state, node);
5785 
5786 	return (B_TRUE);
5787 
5788 ibd_send_fail:
5789 	if (node && mp)
5790 		ibd_free_lsohdr(node, mp);
5791 
5792 	if (dofree)
5793 		freemsg(mp);
5794 
5795 	if (node != NULL)
5796 		ibd_tx_cleanup(state, node);
5797 
5798 	return (rc);
5799 }
5800 
5801 /*
5802  * GLDv3 entry point for transmitting datagram.
5803  */
5804 static mblk_t *
5805 ibd_m_tx(void *arg, mblk_t *mp)
5806 {
5807 	ibd_state_t *state = (ibd_state_t *)arg;
5808 	mblk_t *next;
5809 
5810 	if (state->id_link_state != LINK_STATE_UP) {
5811 		freemsgchain(mp);
5812 		mp = NULL;
5813 	}
5814 
5815 	while (mp != NULL) {
5816 		next = mp->b_next;
5817 		mp->b_next = NULL;
5818 		if (ibd_send(state, mp) == B_FALSE) {
5819 			/* Send fail */
5820 			mp->b_next = next;
5821 			break;
5822 		}
5823 		mp = next;
5824 	}
5825 
5826 	return (mp);
5827 }
5828 
5829 /*
5830  * this handles Tx and Rx completions. With separate CQs, this handles
5831  * only Rx completions.
5832  */
5833 static uint_t
5834 ibd_intr(char *arg)
5835 {
5836 	ibd_state_t *state = (ibd_state_t *)arg;
5837 
5838 	ibd_poll_compq(state, state->id_rcq_hdl);
5839 
5840 	return (DDI_INTR_CLAIMED);
5841 }
5842 
5843 /*
5844  * Poll and drain the cq
5845  */
5846 static uint_t
5847 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
5848     uint_t numwcs)
5849 {
5850 	ibd_wqe_t *wqe;
5851 	ibt_wc_t *wc;
5852 	uint_t total_polled = 0;
5853 	uint_t num_polled;
5854 	int i;
5855 
5856 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5857 		total_polled += num_polled;
5858 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5859 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5860 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5861 			    (wqe->w_type == IBD_WQE_RECV));
5862 			if (wc->wc_status != IBT_WC_SUCCESS) {
5863 				/*
5864 				 * Channel being torn down.
5865 				 */
5866 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5867 					DPRINT(5, "ibd_drain_cq: flush error");
5868 					/*
5869 					 * Only invoke the Tx handler to
5870 					 * release possibly held resources
5871 					 * like AH refcount etc. Can not
5872 					 * invoke Rx handler because it might
5873 					 * try adding buffers to the Rx pool
5874 					 * when we are trying to deinitialize.
5875 					 */
5876 					if (wqe->w_type == IBD_WQE_RECV) {
5877 						continue;
5878 					} else {
5879 						DPRINT(10, "ibd_drain_cq: Bad "
5880 						    "status %d", wc->wc_status);
5881 					}
5882 				}
5883 			}
5884 			if (wqe->w_type == IBD_WQE_SEND) {
5885 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
5886 			} else {
5887 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5888 			}
5889 		}
5890 	}
5891 
5892 	return (total_polled);
5893 }
5894 
5895 /*
5896  * Common code for interrupt handling as well as for polling
5897  * for all completed wqe's while detaching.
5898  */
5899 static void
5900 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5901 {
5902 	ibt_wc_t *wcs;
5903 	uint_t numwcs;
5904 	int flag, redo_flag;
5905 	int redo = 1;
5906 	uint_t num_polled = 0;
5907 
5908 	if (ibd_separate_cqs == 1) {
5909 		if (cq_hdl == state->id_rcq_hdl) {
5910 			flag = IBD_RX_CQ_POLLING;
5911 			redo_flag = IBD_REDO_RX_CQ_POLLING;
5912 		} else {
5913 			flag = IBD_TX_CQ_POLLING;
5914 			redo_flag = IBD_REDO_TX_CQ_POLLING;
5915 		}
5916 	} else {
5917 		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
5918 		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
5919 	}
5920 
5921 	mutex_enter(&state->id_cq_poll_lock);
5922 	if (state->id_cq_poll_busy & flag) {
5923 		state->id_cq_poll_busy |= redo_flag;
5924 		mutex_exit(&state->id_cq_poll_lock);
5925 		return;
5926 	}
5927 	state->id_cq_poll_busy |= flag;
5928 	mutex_exit(&state->id_cq_poll_lock);
5929 
5930 	/*
5931 	 * In some cases (eg detaching), this code can be invoked on
5932 	 * any cpu after disabling cq notification (thus no concurrency
5933 	 * exists). Apart from that, the following applies normally:
5934 	 * The receive completion handling is always on the Rx interrupt
5935 	 * cpu. Transmit completion handling could be from any cpu if
5936 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5937 	 * is interrupt driven. Combined completion handling is always
5938 	 * on the interrupt cpu. Thus, lock accordingly and use the
5939 	 * proper completion array.
5940 	 */
5941 	if (ibd_separate_cqs == 1) {
5942 		if (cq_hdl == state->id_rcq_hdl) {
5943 			wcs = state->id_rxwcs;
5944 			numwcs = state->id_rxwcs_size;
5945 		} else {
5946 			wcs = state->id_txwcs;
5947 			numwcs = state->id_txwcs_size;
5948 		}
5949 	} else {
5950 		wcs = state->id_rxwcs;
5951 		numwcs = state->id_rxwcs_size;
5952 	}
5953 
5954 	/*
5955 	 * Poll and drain the CQ
5956 	 */
5957 	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5958 
5959 	/*
5960 	 * Enable CQ notifications and redrain the cq to catch any
5961 	 * completions we might have missed after the ibd_drain_cq()
5962 	 * above and before the ibt_enable_cq_notify() that follows.
5963 	 * Finally, service any new requests to poll the cq that
5964 	 * could've come in after the ibt_enable_cq_notify().
5965 	 */
5966 	do {
5967 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
5968 		    IBT_SUCCESS) {
5969 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5970 		}
5971 
5972 		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5973 
5974 		mutex_enter(&state->id_cq_poll_lock);
5975 		if (state->id_cq_poll_busy & redo_flag)
5976 			state->id_cq_poll_busy &= ~redo_flag;
5977 		else {
5978 			state->id_cq_poll_busy &= ~flag;
5979 			redo = 0;
5980 		}
5981 		mutex_exit(&state->id_cq_poll_lock);
5982 
5983 	} while (redo);
5984 
5985 	/*
5986 	 * If we polled the receive cq and found anything, we need to flush
5987 	 * it out to the nw layer here.
5988 	 */
5989 	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
5990 		ibd_flush_rx(state, NULL);
5991 	}
5992 }
5993 
5994 /*
5995  * Unmap the memory area associated with a given swqe.
5996  */
5997 static void
5998 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
5999 {
6000 	ibt_status_t stat;
6001 
6002 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6003 
6004 	if (swqe->w_mi_hdl) {
6005 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6006 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6007 			DPRINT(10,
6008 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6009 		}
6010 		swqe->w_mi_hdl = NULL;
6011 	}
6012 	swqe->w_swr.wr_nds = 0;
6013 }
6014 
6015 /*
6016  * Common code that deals with clean ups after a successful or
6017  * erroneous transmission attempt.
6018  */
6019 static void
6020 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6021 {
6022 	ibd_ace_t *ace = swqe->w_ahandle;
6023 
6024 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6025 
6026 	/*
6027 	 * If this was a dynamic mapping in ibd_send(), we need to
6028 	 * unmap here. If this was an lso buffer we'd used for sending,
6029 	 * we need to release the lso buf to the pool, since the resource
6030 	 * is scarce. However, if this was simply a normal send using
6031 	 * the copybuf (present in each swqe), we don't need to release it.
6032 	 */
6033 	if (swqe->swqe_im_mblk != NULL) {
6034 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6035 			ibd_unmap_mem(state, swqe);
6036 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6037 			ibd_release_lsobufs(state,
6038 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6039 		}
6040 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6041 		freemsg(swqe->swqe_im_mblk);
6042 		swqe->swqe_im_mblk = NULL;
6043 	}
6044 
6045 	/*
6046 	 * Drop the reference count on the AH; it can be reused
6047 	 * now for a different destination if there are no more
6048 	 * posted sends that will use it. This can be eliminated
6049 	 * if we can always associate each Tx buffer with an AH.
6050 	 * The ace can be null if we are cleaning up from the
6051 	 * ibd_send() error path.
6052 	 */
6053 	if (ace != NULL) {
6054 		/*
6055 		 * The recycling logic can be eliminated from here
6056 		 * and put into the async thread if we create another
6057 		 * list to hold ACE's for unjoined mcg's.
6058 		 */
6059 		if (DEC_REF_DO_CYCLE(ace)) {
6060 			ibd_mce_t *mce;
6061 
6062 			/*
6063 			 * Check with the lock taken: we decremented
6064 			 * reference count without the lock, and some
6065 			 * transmitter might alreay have bumped the
6066 			 * reference count (possible in case of multicast
6067 			 * disable when we leave the AH on the active
6068 			 * list). If not still 0, get out, leaving the
6069 			 * recycle bit intact.
6070 			 *
6071 			 * Atomically transition the AH from active
6072 			 * to free list, and queue a work request to
6073 			 * leave the group and destroy the mce. No
6074 			 * transmitter can be looking at the AH or
6075 			 * the MCE in between, since we have the
6076 			 * ac_mutex lock. In the SendOnly reap case,
6077 			 * it is not neccesary to hold the ac_mutex
6078 			 * and recheck the ref count (since the AH was
6079 			 * taken off the active list), we just do it
6080 			 * to have uniform processing with the Full
6081 			 * reap case.
6082 			 */
6083 			mutex_enter(&state->id_ac_mutex);
6084 			mce = ace->ac_mce;
6085 			if (GET_REF_CYCLE(ace) == 0) {
6086 				CLEAR_REFCYCLE(ace);
6087 				/*
6088 				 * Identify the case of fullmember reap as
6089 				 * opposed to mcg trap reap. Also, port up
6090 				 * might set ac_mce to NULL to indicate Tx
6091 				 * cleanup should do no more than put the
6092 				 * AH in the free list (see ibd_async_link).
6093 				 */
6094 				if (mce != NULL) {
6095 					ace->ac_mce = NULL;
6096 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6097 					/*
6098 					 * mc_req was initialized at mce
6099 					 * creation time.
6100 					 */
6101 					ibd_queue_work_slot(state,
6102 					    &mce->mc_req, IBD_ASYNC_REAP);
6103 				}
6104 				IBD_ACACHE_INSERT_FREE(state, ace);
6105 			}
6106 			mutex_exit(&state->id_ac_mutex);
6107 		}
6108 	}
6109 
6110 	/*
6111 	 * Release the send wqe for reuse.
6112 	 */
6113 	ibd_release_swqe(state, swqe);
6114 }
6115 
6116 /*
6117  * Hand off the processed rx mp chain to mac_rx()
6118  */
6119 static void
6120 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
6121 {
6122 	if (mpc == NULL) {
6123 		mutex_enter(&state->id_rx_lock);
6124 
6125 		mpc = state->id_rx_mp;
6126 
6127 		state->id_rx_mp = NULL;
6128 		state->id_rx_mp_tail = NULL;
6129 		state->id_rx_mp_len = 0;
6130 
6131 		mutex_exit(&state->id_rx_lock);
6132 	}
6133 
6134 	if (mpc) {
6135 		mac_rx(state->id_mh, state->id_rh, mpc);
6136 	}
6137 }
6138 
6139 /*
6140  * Processing to be done after receipt of a packet; hand off to GLD
6141  * in the format expected by GLD.  The received packet has this
6142  * format: 2b sap :: 00 :: data.
6143  */
6144 static void
6145 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6146 {
6147 	ib_header_info_t *phdr;
6148 	mblk_t *mp;
6149 	mblk_t *mpc = NULL;
6150 	ipoib_hdr_t *ipibp;
6151 	ipha_t *iphap;
6152 	ip6_t *ip6h;
6153 	int rxcnt, len;
6154 
6155 	/*
6156 	 * Track number handed to upper layer, and number still
6157 	 * available to receive packets.
6158 	 */
6159 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
6160 	ASSERT(rxcnt >= 0);
6161 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
6162 
6163 	/*
6164 	 * Adjust write pointer depending on how much data came in.
6165 	 */
6166 	mp = rwqe->rwqe_im_mblk;
6167 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
6168 
6169 	/*
6170 	 * Make sure this is NULL or we're in trouble.
6171 	 */
6172 	if (mp->b_next != NULL) {
6173 		ibd_print_warn(state,
6174 		    "ibd_process_rx: got duplicate mp from rcq?");
6175 		mp->b_next = NULL;
6176 	}
6177 
6178 	/*
6179 	 * the IB link will deliver one of the IB link layer
6180 	 * headers called, the Global Routing Header (GRH).
6181 	 * ibd driver uses the information in GRH to build the
6182 	 * Header_info structure and pass it with the datagram up
6183 	 * to GLDv3.
6184 	 * If the GRH is not valid, indicate to GLDv3 by setting
6185 	 * the VerTcFlow field to 0.
6186 	 */
6187 	phdr = (ib_header_info_t *)mp->b_rptr;
6188 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6189 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6190 
6191 		/* if it is loop back packet, just drop it. */
6192 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6193 		    IPOIB_ADDRL) == 0) {
6194 			freemsg(mp);
6195 			return;
6196 		}
6197 
6198 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6199 		    sizeof (ipoib_mac_t));
6200 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6201 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6202 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6203 		} else {
6204 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6205 		}
6206 	} else {
6207 		/*
6208 		 * It can not be a IBA multicast packet. Must have been
6209 		 * unicast for us. Just copy the interface address to dst.
6210 		 */
6211 		phdr->ib_grh.ipoib_vertcflow = 0;
6212 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6213 		    sizeof (ipoib_mac_t));
6214 	}
6215 
6216 	/*
6217 	 * For ND6 packets, padding is at the front of the source/target
6218 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6219 	 * the padding from such packets.
6220 	 */
6221 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6222 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6223 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
6224 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6225 			    sizeof (ipoib_hdr_t))) {
6226 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
6227 				freemsg(mp);
6228 				return;
6229 			}
6230 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
6231 			    sizeof (ipoib_pgrh_t));
6232 		}
6233 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6234 		len = ntohs(ip6h->ip6_plen);
6235 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6236 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
6237 			    IPV6_HDR_LEN + len) {
6238 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
6239 				    IPV6_HDR_LEN + len)) {
6240 					DPRINT(10, "ibd_process_rx: pullupmsg"
6241 					    " failed");
6242 					freemsg(mp);
6243 					return;
6244 				}
6245 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6246 				    sizeof (ipoib_pgrh_t) +
6247 				    sizeof (ipoib_hdr_t));
6248 			}
6249 			/* LINTED: E_CONSTANT_CONDITION */
6250 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6251 		}
6252 	}
6253 
6254 	/*
6255 	 * Update statistics
6256 	 */
6257 	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
6258 	atomic_inc_64(&state->id_rcv_pkt);
6259 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6260 		atomic_inc_64(&state->id_brd_rcv);
6261 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6262 		atomic_inc_64(&state->id_multi_rcv);
6263 
6264 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6265 	/*
6266 	 * Set receive checksum status in mp
6267 	 * Hardware checksumming can be considered valid only if:
6268 	 * 1. CQE.IP_OK bit is set
6269 	 * 2. CQE.CKSUM = 0xffff
6270 	 * 3. IPv6 routing header is not present in the packet
6271 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6272 	 */
6273 
6274 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6275 	    (wc->wc_cksum == 0xFFFF) &&
6276 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6277 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6278 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6279 	}
6280 
6281 	/*
6282 	 * Add this mp to the list of processed mp's to send to
6283 	 * the nw layer
6284 	 */
6285 	mutex_enter(&state->id_rx_lock);
6286 	if (state->id_rx_mp) {
6287 		ASSERT(state->id_rx_mp_tail != NULL);
6288 		state->id_rx_mp_tail->b_next = mp;
6289 	} else {
6290 		ASSERT(state->id_rx_mp_tail == NULL);
6291 		state->id_rx_mp = mp;
6292 	}
6293 
6294 	state->id_rx_mp_tail = mp;
6295 	state->id_rx_mp_len++;
6296 
6297 	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
6298 		mpc = state->id_rx_mp;
6299 
6300 		state->id_rx_mp = NULL;
6301 		state->id_rx_mp_tail = NULL;
6302 		state->id_rx_mp_len = 0;
6303 	}
6304 
6305 	mutex_exit(&state->id_rx_lock);
6306 
6307 	if (mpc) {
6308 		ibd_flush_rx(state, mpc);
6309 	}
6310 }
6311 
6312 /*
6313  * Callback code invoked from STREAMs when the receive data buffer is
6314  * free for recycling.
6315  */
6316 static void
6317 ibd_freemsg_cb(char *arg)
6318 {
6319 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6320 	ibd_state_t *state = rwqe->w_state;
6321 
6322 	/*
6323 	 * If the wqe is being destructed, do not attempt recycling.
6324 	 */
6325 	if (rwqe->w_freeing_wqe == B_TRUE) {
6326 		DPRINT(6, "ibd_freemsg: wqe being freed");
6327 		return;
6328 	} else {
6329 		/*
6330 		 * Upper layer has released held mblk, so we have
6331 		 * no more use for keeping the old pointer in
6332 		 * our rwqe.
6333 		 */
6334 		rwqe->rwqe_im_mblk = NULL;
6335 	}
6336 
6337 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6338 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6339 	if (rwqe->rwqe_im_mblk == NULL) {
6340 		ibd_delete_rwqe(state, rwqe);
6341 		ibd_free_rwqe(state, rwqe);
6342 		DPRINT(6, "ibd_freemsg: desballoc failed");
6343 		return;
6344 	}
6345 
6346 	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
6347 		ibd_delete_rwqe(state, rwqe);
6348 		ibd_free_rwqe(state, rwqe);
6349 		return;
6350 	}
6351 
6352 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6353 }
6354 
6355 static uint_t
6356 ibd_tx_recycle(char *arg)
6357 {
6358 	ibd_state_t *state = (ibd_state_t *)arg;
6359 
6360 	/*
6361 	 * Poll for completed entries
6362 	 */
6363 	ibd_poll_compq(state, state->id_scq_hdl);
6364 
6365 	/*
6366 	 * Resume any blocked transmissions if possible
6367 	 */
6368 	(void) ibd_resume_transmission(state);
6369 
6370 	return (DDI_INTR_CLAIMED);
6371 }
6372 
6373 #ifdef IBD_LOGGING
6374 static void
6375 ibd_log_init(void)
6376 {
6377 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6378 	ibd_lbuf_ndx = 0;
6379 
6380 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
6381 }
6382 
6383 static void
6384 ibd_log_fini(void)
6385 {
6386 	if (ibd_lbuf)
6387 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6388 	ibd_lbuf_ndx = 0;
6389 	ibd_lbuf = NULL;
6390 
6391 	mutex_destroy(&ibd_lbuf_lock);
6392 }
6393 
6394 static void
6395 ibd_log(const char *fmt, ...)
6396 {
6397 	va_list	ap;
6398 	uint32_t off;
6399 	uint32_t msglen;
6400 	char tmpbuf[IBD_DMAX_LINE];
6401 
6402 	if (ibd_lbuf == NULL)
6403 		return;
6404 
6405 	va_start(ap, fmt);
6406 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
6407 	va_end(ap);
6408 
6409 	if (msglen >= IBD_DMAX_LINE)
6410 		msglen = IBD_DMAX_LINE - 1;
6411 
6412 	mutex_enter(&ibd_lbuf_lock);
6413 
6414 	off = ibd_lbuf_ndx;		/* current msg should go here */
6415 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6416 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6417 
6418 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6419 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6420 
6421 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6422 		ibd_lbuf_ndx = 0;
6423 
6424 	mutex_exit(&ibd_lbuf_lock);
6425 
6426 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6427 }
6428 #endif
6429