xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 6fd12ef379fdceac740caa2565388cb7d7aee547)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * An implementation of the IPoIB standard based on PSARC 2001/289.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/conf.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/modctl.h>
37 #include <sys/stropts.h>
38 #include <sys/stream.h>
39 #include <sys/strsun.h>
40 #include <sys/strsubr.h>
41 #include <sys/dlpi.h>
42 
43 #include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/pattr.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: h/w (Tavor) and driver do not do checksum, IP software must.
 * partial: driver does data checksum, IP must provide pseudo header.
 * perf_partial: driver uses IP provided pseudo cksum as data checksum
 *		 (thus, real checksumming is not done).
 */
76 typedef enum {
77 	IBD_CSUM_NONE,
78 	IBD_CSUM_PARTIAL,
79 	IBD_CSUM_PERF_PARTIAL
80 } ibd_csum_type_t;
81 
82 typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;
83 
84 /*
85  * Per interface tunable parameters.
86  */
87 static uint_t ibd_rx_threshold = 16;
88 static uint_t ibd_tx_current_copy_threshold = 0x10000000;
/* should be less than the max Tavor CQ size and be 2^n - 1 */
90 static uint_t ibd_num_rwqe = 511;
91 static uint_t ibd_num_swqe = 511;
92 static uint_t ibd_num_ah = 16;
93 static uint_t ibd_hash_size = 16;
94 static uint_t ibd_srv_fifos = 0x0;
95 static uint_t ibd_fifo_depth = 0;
96 static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
97 static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;
98 
99 /*
 * The driver can use separate CQs for send and receive queues.
101  * While using separate CQs, it is possible to put the send CQ
102  * in polling mode, ie not to enable notifications on that CQ.
103  * If both CQs are interrupt driven, currently it is not possible
104  * for their handlers to be invoked concurrently (since Tavor ties
105  * both interrupts to the same PCI intr line); but the handlers
106  * are not coded with a single interrupt cpu assumption (eg
107  * id_num_intrs is incremented atomically).
108  *
109  * The driver private struct uses id_scq_hdl to track the separate
110  * CQ being used for send; the id_rcq_hdl tracks the receive CQ
111  * if using separate CQs, or it tracks the single CQ when using
112  * combined CQ. The id_wcs completion array is used in the combined
113  * CQ case, and for fetching Rx completions in the separate CQs case;
114  * the id_txwcs is used to fetch Tx completions in the separate CQs
115  * case.
116  */
117 static uint_t ibd_separate_cqs = 1;
118 static uint_t ibd_txcomp_poll = 0;
119 
/*
 * The softintr is introduced to avoid Event Queue overflow: the CQ
 * event handler function itself should not carry a heavy load.
 * If service fifos are enabled, this is not required, because
 * mac_rx() will be called by service threads.
 */
126 static uint_t ibd_rx_softintr = 1;
127 static uint_t ibd_tx_softintr = 1;
128 
129 /*
130  * Initial number of IBA resources allocated.
131  */
132 #define	IBD_NUM_RWQE	ibd_num_rwqe
133 #define	IBD_NUM_SWQE	ibd_num_swqe
134 #define	IBD_NUM_AH	ibd_num_ah
135 
136 /* when <= threshold, it's faster to copy to a premapped buffer */
137 #define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold
138 
/*
 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
 * IBD_NUM_RWQE/id_num_rwqe.
 */
144 #define	IBD_RX_THRESHOLD	ibd_rx_threshold
145 
146 /*
147  * Hash table size for the active AH list.
148  */
149 #define	IBD_HASH_SIZE	ibd_hash_size
150 
151 #define	IBD_TXPOLL_THRESHOLD 64
152 /*
153  * PAD routine called during send/recv context
154  */
155 #define	IBD_SEND	0
156 #define	IBD_RECV	1
157 
/*
 * fill / clear in <scope> and <p_key> in multicast/broadcast address.
 *
 * IBD_FILL_SCOPE_PKEY(maddr, scope, pkey):
 *	ORs (scope << 16) into the 32-bit word at byte offset 4 of maddr and
 *	(pkey << 16) into the 32-bit word at byte offset 8, both converted
 *	to network byte order with htonl(). Because the values are OR-ed in,
 *	bits already set in those positions are retained.
 *
 * IBD_CLEAR_SCOPE_PKEY(maddr):
 *	Clears the 4 bits at (0xF << 16) of the word at byte offset 4 and the
 *	16 bits at (0xFFFF << 16) of the word at byte offset 8 (masks are
 *	expressed in host order and converted with htonl()).
 *
 * Both macros expand to a bare brace block (not do { } while (0)), so a
 * call followed by ';' cannot be used as the un-braced body of an "if"
 * that has an "else".
 *
 * NOTE(review): the words are accessed through uint32_t pointer casts, so
 * this presumably relies on (maddr + 4) being 4-byte aligned -- confirm
 * against the callers.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)			\
	{							\
		*(uint32_t *)((char *)(maddr) + 4) |=		\
		    htonl((uint32_t)(scope) << 16);		\
		*(uint32_t *)((char *)(maddr) + 8) |=		\
		    htonl((uint32_t)(pkey) << 16);		\
	}

#define	IBD_CLEAR_SCOPE_PKEY(maddr)				\
	{							\
		*(uint32_t *)((char *)(maddr) + 4) &=		\
		    htonl(~((uint32_t)0xF << 16));		\
		*(uint32_t *)((char *)(maddr) + 8) &=		\
		    htonl(~((uint32_t)0xFFFF << 16));		\
	}
176 
177 /*
178  * when free tx wqes >= threshold and reschedule flag is set,
179  * ibd will call mac_tx_update to re-enable Tx.
180  */
181 #define	IBD_TX_UPDATE_THRESHOLD 1
182 
183 /* Driver State Pointer */
184 void *ibd_list;
185 
186 /* Required system entry points */
187 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
188 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
189 
190 /* Required driver entry points for GLDv3 */
191 static int ibd_m_start(void *);
192 static void ibd_m_stop(void *);
193 static int ibd_m_unicst(void *, const uint8_t *);
194 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
195 static int ibd_m_promisc(void *, boolean_t);
196 static int ibd_m_stat(void *, uint_t, uint64_t *);
197 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
198 static mblk_t *ibd_m_tx(void *, mblk_t *);
199 
200 /* Private driver entry points for GLDv3 */
201 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
202 static uint_t ibd_intr(char *);
203 static uint_t ibd_tx_recycle(char *);
204 static int ibd_state_init(ibd_state_t *, dev_info_t *);
205 static void ibd_state_fini(ibd_state_t *);
206 static int ibd_drv_init(ibd_state_t *);
207 static void ibd_drv_fini(ibd_state_t *);
208 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
209 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
210 static void ibd_snet_notices_handler(void *, ib_gid_t,
211     ibt_subnet_event_code_t, ibt_subnet_event_t *);
212 static int ibd_init_txlist(ibd_state_t *);
213 static void ibd_fini_txlist(ibd_state_t *);
214 static int ibd_init_rxlist(ibd_state_t *);
215 static void ibd_fini_rxlist(ibd_state_t *);
216 static void ibd_freemsg_cb(char *);
217 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
218 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
219 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **);
220 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
221 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
222 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
223 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
224     ibt_async_event_t *);
225 static int ibd_acache_init(ibd_state_t *);
226 static void ibd_acache_fini(ibd_state_t *);
227 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
228 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
229 static void ibd_async_unsetprom(ibd_state_t *);
230 static void ibd_async_setprom(ibd_state_t *);
231 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
232 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
233 static void ibd_async_txsched(ibd_state_t *);
234 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
235 static void ibd_async_work(ibd_state_t *);
236 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
237 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
238 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
239 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *,
240     ipoib_mac_t *);
241 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
242 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *);
243 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
244 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
245 static uint64_t ibd_get_portspeed(ibd_state_t *);
246 
247 #ifdef RUN_PERFORMANCE
248 static void ibd_perf(ibd_state_t *);
249 #endif
250 
251 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
252     nodev, NULL, D_MP, NULL);
253 
254 /* Module Driver Info */
255 static struct modldrv ibd_modldrv = {
256 	&mod_driverops,			/* This one is a driver */
257 	"InfiniBand GLDv3 Driver 1.3",	/* short description */
258 	&ibd_dev_ops			/* driver specific ops */
259 };
260 
261 /* Module Linkage */
262 static struct modlinkage ibd_modlinkage = {
263 	MODREV_1, (void *)&ibd_modldrv, NULL
264 };
265 
266 /*
267  * Module Info passed to IBTL during IBT_ATTACH.
268  *   NOTE:  This data must be static (i.e. IBTL just keeps a pointer to this
269  *	    data).
270  */
271 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
272 	IBTI_V2,
273 	IBT_NETWORK,
274 	ibd_async_handler,
275 	NULL,
276 	"IPIB"
277 };
278 
279 /*
280  * Async operation types.
281  */
282 #define	ASYNC_GETAH	1
283 #define	ASYNC_JOIN	2
284 #define	ASYNC_LEAVE	3
285 #define	ASYNC_PROMON	4
286 #define	ASYNC_PROMOFF	5
287 #define	ASYNC_REAP	6
288 #define	ASYNC_TRAP	8
289 #define	ASYNC_SCHED	9
290 #define	ASYNC_LINK	10
291 #define	ASYNC_EXIT	11
292 
293 /*
294  * Async operation states
295  */
296 #define	NOTSTARTED	0
297 #define	ONGOING		1
298 #define	COMPLETED	2
299 #define	ERRORED		3
300 #define	ROUTERED	4
301 
302 #define	IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF
303 
304 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
305 static mac_callbacks_t ib_m_callbacks = {
306 	IBD_M_CALLBACK_FLAGS,
307 	ibd_m_stat,
308 	ibd_m_start,
309 	ibd_m_stop,
310 	ibd_m_promisc,
311 	ibd_m_multicst,
312 	ibd_m_unicst,
313 	ibd_m_tx,
314 	NULL,
315 	NULL,
316 	ibd_m_getcapab
317 };
318 
319 #ifdef DEBUG
320 
321 static int rxpack = 1, txpack = 1;
322 int ibd_debuglevel = 100;
323 static void
324 debug_print(int l, char *fmt, ...)
325 {
326 	va_list ap;
327 
328 	if (l < ibd_debuglevel)
329 		return;
330 	va_start(ap, fmt);
331 	vcmn_err(CE_CONT, fmt, ap);
332 	va_end(ap);
333 }
334 #define	INCRXPACK	(rxpack++)
335 #define	INCTXPACK	(txpack++)
336 #define	DPRINT		debug_print
337 
338 #else /* DEBUG */
339 
340 #define	INCRXPACK	0
341 #define	INCTXPACK	0
342 #define	DPRINT
343 
344 #endif /* DEBUG */
345 
346 /*
347  * Common routine to print warning messages; adds in hca guid, port number
348  * and pkey to be able to identify the IBA interface.
349  */
350 static void
351 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
352 {
353 	ib_guid_t hca_guid;
354 	char ibd_print_buf[256];
355 	int len;
356 	va_list ap;
357 
358 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
359 	    0, "hca-guid", 0);
360 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
361 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
362 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
363 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
364 	va_start(ap, fmt);
365 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
366 	    fmt, ap);
367 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
368 	va_end(ap);
369 }
370 
371 /* warlock directives */
372 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
373     ibd_state_t::id_ah_active))
374 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free))
375 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
376     ibd_state_t::id_req_list))
377 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
378     ibd_state_t::id_acache_req_cv))
379 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
380     ibd_state_t::id_mc_full))
381 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
382     ibd_state_t::id_mc_non))
383 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
384     ibd_state_t::id_link_state))
385 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
386     ibd_state_s::id_tx_list))
387 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
388     ibd_state_s::id_rx_list))
389 
390 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error))
391 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op))
392 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs))
393 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op))
394 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short))
395 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list))
396 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list))
397 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op))
398 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid))
399 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr))
400 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce))
401 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref))
402 
403 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s))
404 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s))
405 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s))
406 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac))
407 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh))
408 
409 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s))
410 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req))
411 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap))
412 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate))
413 
414 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr))
415 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr))
416 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id))
417 
418 #ifdef DEBUG
419 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack))
420 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack))
421 #endif
422 
423 int
424 _init()
425 {
426 	int status;
427 
428 	/*
429 	 * Sanity check some parameter settings. Tx completion polling
430 	 * only makes sense with separate CQs for Tx and Rx.
431 	 */
432 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
433 		cmn_err(CE_NOTE, "!ibd: %s",
434 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
435 		ibd_txcomp_poll = 0;
436 	}
437 
438 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
439 	if (status != 0) {
440 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
441 		return (status);
442 	}
443 
444 	mac_init_ops(&ibd_dev_ops, "ibd");
445 	status = mod_install(&ibd_modlinkage);
446 	if (status != 0) {
447 		DPRINT(10, "_init:failed in mod_install()");
448 		ddi_soft_state_fini(&ibd_list);
449 		mac_fini_ops(&ibd_dev_ops);
450 		return (status);
451 	}
452 
453 	return (0);
454 }
455 
456 int
457 _info(struct modinfo *modinfop)
458 {
459 	return (mod_info(&ibd_modlinkage, modinfop));
460 }
461 
462 int
463 _fini()
464 {
465 	int status;
466 
467 	status = mod_remove(&ibd_modlinkage);
468 	if (status != 0)
469 		return (status);
470 
471 	mac_fini_ops(&ibd_dev_ops);
472 	ddi_soft_state_fini(&ibd_list);
473 	return (0);
474 }
475 
476 /*
477  * Convert the GID part of the mac address from network byte order
478  * to host order.
479  */
480 static void
481 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
482 {
483 	ib_sn_prefix_t nbopref;
484 	ib_guid_t nboguid;
485 
486 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
487 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
488 	dgid->gid_prefix = b2h64(nbopref);
489 	dgid->gid_guid = b2h64(nboguid);
490 }
491 
492 /*
493  * Create the IPoIB address in network byte order from host order inputs.
494  */
495 static void
496 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
497     ib_guid_t guid)
498 {
499 	ib_sn_prefix_t nbopref;
500 	ib_guid_t nboguid;
501 
502 	mac->ipoib_qpn = htonl(qpn);
503 	nbopref = h2b64(prefix);
504 	nboguid = h2b64(guid);
505 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
506 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
507 }
508 
509 /*
510  * Send to the appropriate all-routers group when the IBA multicast group
511  * does not exist, based on whether the target group is v4 or v6.
512  */
513 static boolean_t
514 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
515     ipoib_mac_t *rmac)
516 {
517 	boolean_t retval = B_TRUE;
518 	uint32_t adjscope = state->id_scope << 16;
519 	uint32_t topword;
520 
521 	/*
522 	 * Copy the first 4 bytes in without assuming any alignment of
523 	 * input mac address; this will have IPoIB signature, flags and
524 	 * scope bits.
525 	 */
526 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
527 	topword = ntohl(topword);
528 
529 	/*
530 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
531 	 */
532 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
533 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
534 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
535 		    ((uint32_t)(state->id_pkey << 16))),
536 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
537 	else
538 		/*
539 		 * Does not have proper bits in the mgid address.
540 		 */
541 		retval = B_FALSE;
542 
543 	return (retval);
544 }
545 
/*
 * Implementation of various (software) flavors of send and receive side
 * checksumming.
 *
 * IBD_CKSUM_SEND(mp): software transmit checksum for the packet in mp,
 * which is expected to start with the IPoIB header.
 *
 *  - Does nothing when ibd_csum_send is IBD_CSUM_NONE, or when
 *    hcksum_retrieve() does not report exactly HCK_PARTIALCKSUM.
 *  - IBD_CSUM_PARTIAL: the value found at the stuff offset is used as the
 *    seed, the field is zeroed, and IP_BCSUM_PARTIAL() is run from the
 *    start offset to b_wptr of this mblk only.
 *  - Otherwise (perf_partial): the value found at the stuff offset is
 *    used as if it were the computed sum.
 *  - The 16-bit complement of the sum is stored at the stuff offset, with
 *    a result of 0 replaced by 0xFFFF.
 *
 * The expansion declares the label punt_send, so the macro can appear at
 * most once in any function.
 */
#define	IBD_CKSUM_SEND(mp) {						\
	uint32_t start, stuff, end, value, flags;			\
	uint32_t cksum = 0, sum;					\
	uchar_t *dp, *buf;						\
	uint16_t *up;							\
									\
	if (ibd_csum_send == IBD_CSUM_NONE)				\
		goto punt_send;						\
									\
	/*								\
	 * Query IP whether Tx cksum needs to be done.			\
	 */								\
	hcksum_retrieve((mp), NULL, NULL, &start, &stuff, &end,		\
	    &value, &flags);						\
									\
	if (flags == HCK_PARTIALCKSUM)	{				\
		dp = ((uchar_t *)(mp)->b_rptr + IPOIB_HDRSIZE);		\
		up =  (uint16_t *)(dp + stuff);				\
		if (ibd_csum_send == IBD_CSUM_PARTIAL) {		\
			end = ((uchar_t *)(mp)->b_wptr - dp - start);	\
			cksum = *up;					\
			*up = 0;					\
			/*						\
			 * Does NOT handle chained mblks/more than one	\
			 * SGL. Applicable only for a single SGL	\
			 * entry/mblk, where the stuff offset is	\
			 * within the range of buf.			\
			 */						\
			buf = (dp + start);				\
			sum = IP_BCSUM_PARTIAL(buf, end, cksum);	\
		} else {						\
			sum = *up;					\
		}							\
		DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n",	\
		    start, stuff, end, sum, cksum);			\
		/*							\
		 * Complement within 16 bits. Testing the full 32-bit	\
		 * complement for zero never matches, which would let a	\
		 * checksum of 0 be stored instead of 0xFFFF.		\
		 */							\
		sum = (~sum) & 0xFFFF;					\
		*(up) = (uint16_t)((sum != 0) ? sum : 0xFFFF);		\
	}								\
punt_send:								\
	;								\
}
591 
/*
 * IBD_CKSUM_RECV(mp): software receive-side checksum assist. mp is read
 * as [IPOIB_GRH_SIZE bytes][IPoIB header][IP header ...] starting at
 * b_rptr.
 *
 * Nothing is attached to mp (control jumps to punt_recv) when:
 *  - ibd_csum_recv is IBD_CSUM_NONE;
 *  - the IPoIB type is not ETHERTYPE_IP;
 *  - any bit other than IPH_DF is set in the fragment offset/flags word;
 *  - the IP protocol is neither TCP nor UDP.
 *
 * Otherwise start is the IP header length, stuff is start + 16 for TCP or
 * start + 6 for UDP, and end is ipha_length. For IBD_CSUM_PARTIAL the
 * value is IP_BCSUM_PARTIAL() over the (end - start) bytes after the IP
 * header; for any other mode the 16-bit word at the stuff offset is used
 * as the value. The result is attached with hcksum_assoc() using
 * HCK_PARTIALCKSUM.
 *
 * The expansion declares the label punt_recv, so the macro can appear at
 * most once in any function.
 *
 * NOTE(review): nothing here checks that the mblk actually holds the
 * headers being read, or that ipha_length fits within the mblk; this
 * presumably relies on the callers / receive buffer sizing -- confirm
 * before enabling a non-NONE ibd_csum_recv mode.
 */
#define	IBD_CKSUM_RECV(mp) {						\
	uchar_t *dp, *buf;						\
	uint32_t start, end, value, stuff, flags;			\
	uint16_t *up, frag;						\
	ipha_t *iphp;							\
	ipoib_hdr_t *ipibh;						\
									\
	if (ibd_csum_recv == IBD_CSUM_NONE)				\
		goto punt_recv;					 	\
									\
	ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\
	if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP)		 	\
		goto punt_recv;						\
									\
	dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE);			\
	iphp = (ipha_t *)dp;						\
	frag = ntohs(iphp->ipha_fragment_offset_and_flags);		\
	if ((frag) & (~IPH_DF))						\
		goto punt_recv;						\
	start = IPH_HDR_LENGTH(iphp);					\
	if (iphp->ipha_protocol == IPPROTO_TCP)				\
		stuff = start + 16;					\
	else if (iphp->ipha_protocol == IPPROTO_UDP)			\
		stuff = start + 6;					\
	else								\
		goto punt_recv;						\
									\
	flags = HCK_PARTIALCKSUM;					\
	end = ntohs(iphp->ipha_length);					\
	up = (uint16_t *)(dp + stuff);					\
									\
	if (ibd_csum_recv == IBD_CSUM_PARTIAL) {			\
		buf = (dp + start);					\
		value = IP_BCSUM_PARTIAL(buf, end - start, 0);		\
	} else {							\
		value = (*up);						\
	}								\
	if (hcksum_assoc(mp, NULL, NULL, start, stuff, end,		\
	    value, flags, 0) != 0)					\
		DPRINT(10, "cksum_recv: value: %x\n", value);		\
punt_recv:								\
	;								\
}
635 
/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts padding by default at the end. The routine which does this is
 * nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
 * machdr is not 4 byte aligned and has 2 bytes of padding at the end.
 *
 * The send routine of the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr] resulting in machdr being 4 byte
 * aligned.
 *
 * At the receiving side ibd_process_rx again takes the above packet and
 * removes the two bytes of front padding and inserts it at the end. This
 * is because the IP layer does not understand padding at the front.
 *
 * Macro arguments:
 *	ip6h	pointer to the IPv6 header; the ICMPv6 header is taken to
 *		start at &ip6h[1].
 *	len	an lvalue; it is MODIFIED by the macro (decremented by
 *		sizeof (nd_neighbor_advert_t)). Nothing is moved when the
 *		result is 0, i.e. when no option follows the ND message.
 *	type	0 (IBD_SEND): the IPOIB_ADDRL bytes following the option
 *		header are shifted up by two bytes and the first two bytes
 *		are zeroed (padding moved to the front).
 *		non-zero (IBD_RECV): the IPOIB_ADDRL bytes are shifted down
 *		by two bytes and the two bytes after them are zeroed
 *		(padding moved to the end).
 *
 * Only ND_NEIGHBOR_SOLICIT and ND_NEIGHBOR_ADVERT messages are touched.
 * The two trailing stores rely on the value the loop index i is left
 * with (0 after the send loop, IPOIB_ADDRL after the receive loop).
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t 	*nd_lla_ptr;					\
	icmp6_t 	*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int 		i;						\
									\
	icmp6 = (icmp6_t *)&ip6h[1];					\
	len -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    (len != 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if (type == 0) {					\
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
683 
684 /*
685  * The service fifo code is copied verbatim from Cassini. This can be
686  * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu.
687  */
688 
/* A fifo slot holds one opaque pointer-sized object. */
typedef caddr_t fifo_obj_t, *p_fifo_obj_t;

/*
 * A bounded circular queue of objects serviced by a dedicated kernel
 * thread. All queue state is manipulated with fifo_lock held.
 */
typedef struct _srv_fifo_t {
	kmutex_t fifo_lock;	/* guards the queue state below */
	kcondvar_t fifo_cv;	/* work-available / shutdown handshake */
	size_t size;		/* number of slots in fifo_objs */
	uint_t max_index;	/* size - 1; indices wrap to 0 after it */
	uint_t rd_index;	/* advanced first, then slot is read */
	uint_t wr_index;	/* advanced first, then slot is written */
	uint_t objs_pending;	/* objects queued and not yet taken */
	p_fifo_obj_t fifo_objs;	/* array of "size" slots */
	kthread_t *fifo_thread;	/* the service thread */
	void (*drain_func)(caddr_t drain_func_arg);	/* thread entry point */
	caddr_t drain_func_arg;	/* returned by _ddi_srv_fifo_begin() */
	/*
	 * Despite its type, this holds DDI_SUCCESS while the fifo is in
	 * service and DDI_FAILURE once _ddi_srv_fifo_destroy() has asked
	 * the thread to exit; _ddi_srv_fifo_end() sets it back to
	 * DDI_SUCCESS to acknowledge the exit.
	 */
	boolean_t running;
	callb_cpr_t cprinfo;	/* CPR (suspend/resume) callback state */
} srv_fifo_t, *p_srv_fifo_t;
_NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv))
_NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo))
708 
709 static int
710 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size,
711 			void (*drain_func)(), caddr_t drain_func_arg)
712 {
713 	int status;
714 	p_srv_fifo_t srv_fifo;
715 
716 	status = DDI_SUCCESS;
717 	srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP);
718 	srv_fifo->size = size;
719 	srv_fifo->max_index = size - 1;
720 	srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc(
721 	    size * sizeof (fifo_obj_t), KM_SLEEP);
722 	mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL);
723 	cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL);
724 	srv_fifo->drain_func = drain_func;
725 	srv_fifo->drain_func_arg = drain_func_arg;
726 	srv_fifo->running = DDI_SUCCESS;
727 	srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func,
728 	    (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60);
729 	if (srv_fifo->fifo_thread == NULL) {
730 		cv_destroy(&srv_fifo->fifo_cv);
731 		mutex_destroy(&srv_fifo->fifo_lock);
732 		kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t));
733 		kmem_free(srv_fifo, sizeof (srv_fifo_t));
734 		srv_fifo = NULL;
735 		status = DDI_FAILURE;
736 	} else
737 		*handle = srv_fifo;
738 	return (status);
739 }
740 
741 static void
742 _ddi_srv_fifo_destroy(p_srv_fifo_t handle)
743 {
744 	kt_did_t tid = handle->fifo_thread->t_did;
745 
746 	mutex_enter(&handle->fifo_lock);
747 	handle->running = DDI_FAILURE;
748 	cv_signal(&handle->fifo_cv);
749 	while (handle->running == DDI_FAILURE)
750 		cv_wait(&handle->fifo_cv, &handle->fifo_lock);
751 	mutex_exit(&handle->fifo_lock);
752 	if (handle->objs_pending != 0)
753 		cmn_err(CE_NOTE, "!Thread Exit with work undone.");
754 	cv_destroy(&handle->fifo_cv);
755 	mutex_destroy(&handle->fifo_lock);
756 	kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t));
757 	kmem_free(handle, sizeof (srv_fifo_t));
758 	thread_join(tid);
759 }
760 
761 static caddr_t
762 _ddi_srv_fifo_begin(p_srv_fifo_t handle)
763 {
764 #ifndef __lock_lint
765 	CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock,
766 	    callb_generic_cpr, "srv_fifo");
767 #endif /* ! _lock_lint */
768 	return (handle->drain_func_arg);
769 }
770 
/*
 * Called by the service thread as its last action: acknowledges the
 * shutdown request from _ddi_srv_fifo_destroy() and exits the thread.
 * Does not return.
 */
static void
_ddi_srv_fifo_end(p_srv_fifo_t handle)
{
	callb_cpr_t cprinfo;

	mutex_enter(&handle->fifo_lock);
	/*
	 * Work from a stack copy of the CPR state. _ddi_srv_fifo_destroy()
	 * frees the handle once it observes running back at DDI_SUCCESS.
	 * NOTE(review): the copy is presumably taken so that CALLB_CPR_EXIT
	 * does not operate on the handle's own cprinfo after the wakeup --
	 * confirm.
	 */
	cprinfo = handle->cprinfo;
	/* Setting running back to DDI_SUCCESS is the "thread is done" ack. */
	handle->running = DDI_SUCCESS;
	cv_signal(&handle->fifo_cv);
#ifndef __lock_lint
	/*
	 * There is no explicit mutex_exit() in this function;
	 * NOTE(review): CALLB_CPR_EXIT is expected to drop fifo_lock
	 * (registered in _ddi_srv_fifo_begin()) -- confirm against callb.h.
	 */
	CALLB_CPR_EXIT(&cprinfo);
#endif /* ! _lock_lint */
	thread_exit();
	_NOTE(NOT_REACHED)
}
786 
787 static int
788 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal)
789 {
790 	int status;
791 
792 	mutex_enter(&handle->fifo_lock);
793 	status = handle->running;
794 	if (status == DDI_SUCCESS) {
795 		if (ptr) {
796 			if (handle->objs_pending < handle->size) {
797 				if (handle->wr_index == handle->max_index)
798 					handle->wr_index = 0;
799 				else
800 					handle->wr_index++;
801 				handle->fifo_objs[handle->wr_index] = ptr;
802 				handle->objs_pending++;
803 			} else
804 				status = DDI_FAILURE;
805 			if (signal)
806 				cv_signal(&handle->fifo_cv);
807 		} else {
808 			if (signal && (handle->objs_pending > 0))
809 				cv_signal(&handle->fifo_cv);
810 		}
811 	}
812 	mutex_exit(&handle->fifo_lock);
813 	return (status);
814 }
815 
816 static int
817 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr)
818 {
819 	int status;
820 
821 	mutex_enter(&handle->fifo_lock);
822 	status = handle->running;
823 	if (status == DDI_SUCCESS) {
824 		if (handle->objs_pending == 0) {
825 #ifndef __lock_lint
826 			CALLB_CPR_SAFE_BEGIN(&handle->cprinfo);
827 			cv_wait(&handle->fifo_cv, &handle->fifo_lock);
828 			CALLB_CPR_SAFE_END(&handle->cprinfo,
829 			    &handle->fifo_lock);
830 #endif /* !_lock_lint */
831 			*ptr = NULL;
832 		}
833 		if (handle->objs_pending > 0) {
834 			if (handle->rd_index == handle->max_index)
835 				handle->rd_index = 0;
836 			else
837 				handle->rd_index++;
838 			*ptr = handle->fifo_objs[handle->rd_index];
839 			handle->objs_pending--;
840 		}
841 		status = handle->running;
842 	} else {
843 		if (handle->objs_pending) {
844 			if (handle->rd_index == handle->max_index)
845 				handle->rd_index = 0;
846 			else
847 				handle->rd_index++;
848 			*ptr = handle->fifo_objs[handle->rd_index];
849 			handle->objs_pending--;
850 			status = DDI_SUCCESS;
851 		} else
852 			status = DDI_FAILURE;
853 	}
854 	mutex_exit(&handle->fifo_lock);
855 	return (status);
856 }
857 
858 /*
859  * [un]map_rx_srv_fifos has been modified from its CE version.
860  */
861 static void
862 drain_fifo(p_srv_fifo_t handle)
863 {
864 	ibd_state_t *state;
865 	mblk_t *mp;
866 
867 	state = (ibd_state_t *)_ddi_srv_fifo_begin(handle);
868 	while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) {
869 		/*
870 		 * Hand off to GLDv3.
871 		 */
872 		IBD_CKSUM_RECV(mp);
873 		mac_rx(state->id_mh, NULL, mp);
874 	}
875 	_ddi_srv_fifo_end(handle);
876 }
877 
878 static p_srv_fifo_t *
879 map_rx_srv_fifos(int *nfifos, void *private)
880 {
881 	p_srv_fifo_t *srv_fifos;
882 	int i, inst_taskqs, depth;
883 
884 	/*
885 	 * Default behavior on both sparc and amd cpus in terms of
886 	 * of worker thread is as follows: (N) indicates worker thread
887 	 * not enabled , (Y) indicates worker thread enabled. Default of
888 	 * ibd_srv_fifo is set to 0xffff. The default behavior can be
889 	 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below.
890 	 * Worker thread model assigns lower priority to network
891 	 * processing making system more usable at higher network
892 	 * loads.
893 	 *  ________________________________________________________
894 	 * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff |
895 	 * |----------------------|---|---|-------|---|---|---------|
896 	 * |			  |   Sparc	  |   	x86	    |
897 	 * |----------------------|---|---|-------|---|---|---------|
898 	 * | Single CPU		  |N  | Y | N	  | N | Y | N	    |
899 	 * |----------------------|---|---|-------|---|---|---------|
900 	 * | Multi CPU		  |N  | Y | Y	  | N | Y | Y	    |
901 	 * |______________________|___|___|_______|___|___|_________|
902 	 */
903 	if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) ||
904 	    (ibd_srv_fifos == 0)) {
905 		*nfifos = 0;
906 		return ((p_srv_fifo_t *)1);
907 	}
908 
909 	*nfifos = inst_taskqs;
910 	srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t),
911 	    KM_SLEEP);
912 
913 	/*
914 	 * If the administrator has specified a fifo depth, use
915 	 * that, else just decide what should be the depth.
916 	 */
917 	if (ibd_fifo_depth == 0)
918 		depth = (IBD_NUM_RWQE / inst_taskqs) + 16;
919 	else
920 		depth = ibd_fifo_depth;
921 
922 	for (i = 0; i < inst_taskqs; i++)
923 		if (_ddi_srv_fifo_create(&srv_fifos[i],
924 		    depth, drain_fifo,
925 		    (caddr_t)private) != DDI_SUCCESS)
926 			break;
927 
928 	if (i < inst_taskqs)
929 		goto map_rx_srv_fifos_fail1;
930 
931 	goto map_rx_srv_fifos_exit;
932 
933 map_rx_srv_fifos_fail1:
934 	i--;
935 	for (; i >= 0; i--) {
936 		_ddi_srv_fifo_destroy(srv_fifos[i]);
937 	}
938 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
939 	srv_fifos = NULL;
940 
941 map_rx_srv_fifos_exit:
942 	return (srv_fifos);
943 }
944 
945 static void
946 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos)
947 {
948 	int i;
949 
950 	/*
951 	 * If this interface was not using service fifos, quickly return.
952 	 */
953 	if (inst_taskqs == 0)
954 		return;
955 
956 	for (i = 0; i < inst_taskqs; i++) {
957 		_ddi_srv_fifo_destroy(srv_fifos[i]);
958 	}
959 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
960 }
961 
962 /*
963  * Choose between sending up the packet directly and handing off
964  * to a service thread.
965  */
966 static void
967 ibd_send_up(ibd_state_t *state, mblk_t *mp)
968 {
969 	p_srv_fifo_t *srvfifo;
970 	ipoib_hdr_t *lhdr;
971 	struct ip *ip_hdr;
972 	struct udphdr *tran_hdr;
973 	uchar_t prot;
974 	int tnum = -1, nfifos = state->id_nfifos;
975 
976 	/*
977 	 * Quick path if the interface is not using service fifos.
978 	 */
979 	if (nfifos == 0) {
980 hand_off:
981 		IBD_CKSUM_RECV(mp);
982 		mac_rx(state->id_mh, NULL, mp);
983 		return;
984 	}
985 
986 	/*
987 	 * Is the packet big enough to look at the IPoIB header
988 	 * and basic IP header to determine whether it is an
989 	 * IPv4 packet?
990 	 */
991 	if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE +
992 	    sizeof (struct ip))) {
993 
994 		lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE);
995 
996 		/*
997 		 * Is the packet an IP(v4) packet?
998 		 */
999 		if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) {
1000 
1001 			ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE +
1002 			    IPOIB_HDRSIZE);
1003 			prot = ip_hdr->ip_p;
1004 
1005 			/*
1006 			 * TCP or UDP packet? We use the UDP header, since
1007 			 * the first few words of both headers are laid out
1008 			 * similarly (src/dest ports).
1009 			 */
1010 			if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) {
1011 
1012 				tran_hdr = (struct udphdr *)(
1013 				    (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2));
1014 
1015 				/*
1016 				 * Are we within limits of this packet? If
1017 				 * so, use the destination port to hash to
1018 				 * a service thread.
1019 				 */
1020 				if (mp->b_wptr >= ((uchar_t *)tran_hdr +
1021 				    sizeof (*tran_hdr)))
1022 					tnum = (ntohs(tran_hdr->uh_dport) +
1023 					    ntohs(tran_hdr->uh_sport)) %
1024 					    nfifos;
1025 			}
1026 		}
1027 	}
1028 
1029 	/*
1030 	 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the
1031 	 * packet up in interrupt context, reducing latency.
1032 	 */
1033 	if (tnum == -1) {
1034 		goto hand_off;
1035 	}
1036 
1037 	srvfifo = (p_srv_fifo_t *)state->id_fifos;
1038 	if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp,
1039 	    B_TRUE) != DDI_SUCCESS)
1040 		freemsg(mp);
1041 }
1042 
1043 /*
1044  * Address handle entries maintained by the driver are kept in the
1045  * free and active lists. Each entry starts out in the free list;
1046  * it migrates to the active list when primed using ibt_get_paths()
1047  * and ibt_modify_ud_dest() for transmission to a specific destination.
1048  * In the active list, the entry has a reference count indicating the
1049  * number of ongoing/uncompleted transmits that reference it. The
1050  * entry is left in the active list even after the reference count
1051  * goes to 0, since successive transmits can find it there and do
1052  * not need to set up another entry (ie the path information is
1053  * cached using the active list). Entries on the active list are
1054  * also hashed using the destination link address as a key for faster
1055  * lookups during transmits.
1056  *
1057  * For any destination address (unicast or multicast, whatever the
1058  * join states), there will be at most one entry in the active list.
1059  * Entries with a 0 reference count on the active list can be reused
1060  * for a transmit to a new destination, if the free list is empty.
1061  *
1062  * The AH free list insertion/deletion is protected with the id_ac_mutex,
1063  * since the async thread and Tx callback handlers insert/delete. The
1064  * active list does not need a lock (all operations are done by the
1065  * async thread) but updates to the reference count are atomically
1066  * done (increments done by Tx path, decrements by the Tx callback handler).
1067  */
/*
 * AH cache list helpers. "state" is an ibd_state_t pointer expression
 * and "ce" an ibd_ace_t pointer expression; both are parenthesized in
 * the expansions so that any pointer-valued expression may be passed.
 * Note that the arguments may be evaluated more than once, so they
 * must not have side effects.
 *
 * INSERT_FREE/GET_FREE:     push onto / pop from the head of id_ah_free.
 * INSERT_ACTIVE:            push onto id_ah_active and add to the active
 *                           hash, keyed by the address of ce->ac_mac.
 * PULLOUT_ACTIVE:           remove from id_ah_active and from the hash.
 * GET_ACTIVE:               pop from the head of id_ah_active (the hash
 *                           is not touched).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&(state)->id_ah_free, (ce))
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&(state)->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {				\
	int _ret_;							\
	list_insert_head(&(state)->id_ah_active, (ce));			\
	_ret_ = mod_hash_insert((state)->id_ah_active_hash,		\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce));	\
	ASSERT(_ret_ == 0);						\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {				\
	list_remove(&(state)->id_ah_active, (ce));			\
	(void) mod_hash_remove((state)->id_ah_active_hash,		\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce));	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&(state)->id_ah_active)
1086 
1087 /*
1088  * Membership states for different mcg's are tracked by two lists:
1089  * the "non" list is used for promiscuous mode, when all mcg traffic
1090  * needs to be inspected. This type of membership is never used for
1091  * transmission, so there can not be an AH in the active list
1092  * corresponding to a member in this list. This list does not need
1093  * any protection, since all operations are performed by the async
1094  * thread.
1095  *
1096  * "Full" and "SendOnly" membership is tracked using a single list,
1097  * the "full" list. This is because this single list can then be
1098  * searched during transmit to a multicast group (if an AH for the
1099  * mcg is not found in the active list), since at least one type
1100  * of membership must be present before initiating the transmit.
1101  * This list is also emptied during driver detach, since sendonly
1102  * membership acquired during transmit is dropped at detach time
1103  * along with ipv4 broadcast full membership. Insert/deletes to
1104  * this list are done only by the async thread, but it is also
1105  * searched in program context (see multicast disable case), thus
1106  * the id_mc_mutex protects the list. The driver detach path also
1107  * deconstructs the "full" list, but it ensures that the async
1108  * thread will not be accessing the list (by blocking out mcg
1109  * trap handling and making sure no more Tx reaping will happen).
1110  *
1111  * Currently, an IBA attach is done in the SendOnly case too,
1112  * although this is not required.
1113  */
/*
 * MCG membership list helpers for the "full" (id_mc_full) and "non"
 * (id_mc_non) lists: insert at head, search by mgid via
 * ibd_mcache_find(), and remove. "state" is parenthesized in the
 * expansions so that any pointer-valued expression may be passed.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
1126 
1127 /*
1128  * AH and MCE active list manipulation:
1129  *
1130  * Multicast disable requests and MCG delete traps are two cases
1131  * where the active AH entry for the mcg (if any unreferenced one exists)
1132  * will be moved to the free list (to force the next Tx to the mcg to
1133  * join the MCG in SendOnly mode). Port up handling will also move AHs
1134  * from active to free list.
1135  *
1136  * In the case when some transmits are still pending on an entry
1137  * for an mcg, but a multicast disable has already been issued on the
1138  * mcg, there are some options to consider to preserve the join state
1139  * to ensure the emitted packet is properly routed on the IBA fabric.
1140  * For the AH, we can
1141  * 1. take out of active list at multicast disable time.
1142  * 2. take out of active list only when last pending Tx completes.
1143  * For the MCE, we can
1144  * 3. take out of active list at multicast disable time.
1145  * 4. take out of active list only when last pending Tx completes.
1146  * 5. move from active list to stale list at multicast disable time.
1147  * We choose to use 2,4. We use option 4 so that if a multicast enable
1148  * is tried before the pending Tx completes, the enable code finds the
1149  * mce in the active list and just has to make sure it will not be reaped
1150  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1151  * a stale list (#5) that would be checked in the enable code would need
1152  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1153  * after the multicast disable would try to put an AH in the active list,
1154  * and associate the mce it finds in the active list to this new AH,
1155  * whereas the mce is already associated with the previous AH (taken off
1156  * the active list), and will be removed once the pending Tx's complete
1157  * (unless a reference count on mce's is implemented). One implication of
1158  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1159  * grab new references on the AH, further delaying the leave.
1160  *
1161  * In the case of mcg delete (or create) trap when the port is sendonly
1162  * joined, the AH and MCE handling is different: the AH and MCE has to be
1163  * immediately taken off the active lists (forcing a join and path lookup
1164  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1165  * to an mcg as it is repeatedly created and deleted and goes thru
1166  * reincarnations).
1167  *
1168  * When a port is already sendonly joined, and a multicast enable is
1169  * attempted, the same mce structure is promoted; this ensures only a
1170  * single mce on the active list tracks the most powerful join state.
1171  *
1172  * In the case of port up event handling, the MCE for sendonly membership
1173  * is freed up, and the ACE is put into the free list as soon as possible
1174  * (depending on whether posted Tx's have completed). For fullmembership
1175  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1176  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1177  * done; else the mce is deconstructed (mc_fullreap case).
1178  *
1179  * MCG creation and deletion trap handling:
1180  *
1181  * These traps are unreliable (meaning sometimes the trap might never
1182  * be delivered to the subscribed nodes) and may arrive out-of-order
1183  * since they use UD transport. An alternative to relying on these
1184  * unreliable traps is to poll for mcg presence every so often, but
1185  * instead of doing that, we try to be as conservative as possible
1186  * while handling the traps, and hope that the traps do arrive at
1187  * the subscribed nodes soon. Note that if a node is fullmember
1188  * joined to an mcg, it can not possibly receive a mcg create/delete
1189  * trap for that mcg (by fullmember definition); if it does, it is
1190  * an old trap from a previous incarnation of the mcg.
1191  *
1192  * Whenever a trap is received, the driver cleans up its sendonly
1193  * membership to the group; we choose to do a sendonly leave even
1194  * on a creation trap to handle the case of a prior deletion of the mcg
1195  * having gone unnoticed. Consider an example scenario:
1196  * T1: MCG M is deleted, and fires off deletion trap D1.
1197  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1198  * T3: Node N tries to transmit to M, joining in sendonly mode.
1199  * T4: MCG M is deleted, and fires off deletion trap D2.
1200  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1201  *     If the trap is D2, then a LEAVE is not required, since the mcg
1202  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1203  *     approach is to always LEAVE, but the SM may be confused if it
1204  *     receives a LEAVE without a prior JOIN.
1205  *
1206  * Management of the non-membership to an mcg is similar to the above,
1207  * except that if the interface is in promiscuous mode, it is required
1208  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1209  * if the re-join attempt fails (in which case a warning message needs
1210  * to be printed), it is not clear whether it failed due to the mcg not
1211  * existing, or some fabric/hca issues, due to the delayed nature of
1212  * trap delivery. Querying the SA to establish presence/absence of the
1213  * mcg is also racy at best. Thus, the driver just prints a warning
1214  * message when it can not rejoin after receiving a create trap, although
1215  * this might be (on rare occasions) a mis-warning if the create trap is
1216  * received after the mcg was deleted.
1217  */
1218 
/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that max reference
 * count on any handle is limited by number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 *
 * All macros take a pointer expression "ace" whose target has a
 * 32-bit ac_ref field. The argument is parenthesized everywhere, but
 * several macros evaluate it more than once, so it must not have side
 * effects.
 *
 * CYCLEVAL		the "cycle" (recycle) bit within ac_ref.
 * CLEAR_REFCYCLE	zero both the reference count and the cycle bit.
 * CYCLE_SET		true if the cycle bit is set.
 * GET_REF		raw ac_ref value (count plus cycle bit).
 * GET_REF_CYCLE	reference count with the cycle bit masked off;
 *			asserts that the cycle bit is set.
 * INC_REF		atomically add num to ac_ref.
 * SET_CYCLE_IF_REF	B_TRUE if the cycle bit is (now) set because
 *			references are outstanding; B_FALSE, with ac_ref
 *			left at 0, if there were no references.
 * DEC_REF_DO_CYCLE	atomically drop one reference; B_TRUE if that
 *			left exactly the cycle bit, ie the last reference
 *			on a handle marked for recycling went away.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	((ace)->ac_ref = 0)
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (					\
	/*							\
	 * Make sure "cycle" bit is set.			\
	 */							\
	ASSERT(CYCLE_SET(ace)),					\
	((ace)->ac_ref & ~(CYCLEVAL))				\
)
#define	INC_REF(ace, num) {					\
	atomic_add_32(&(ace)->ac_ref, (num));			\
}
#define	SET_CYCLE_IF_REF(ace) (					\
	CYCLE_SET(ace) ? B_TRUE :				\
	    ((atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL) ?					\
		/*						\
		 * Clear the "cycle" bit we just set;		\
		 * ref count known to be 0 from above.		\
		 */						\
		(CLEAR_REFCYCLE(ace), B_FALSE) :		\
		/*						\
		 * We set "cycle" bit; let caller know.		\
		 */						\
		B_TRUE)						\
)
#define	DEC_REF_DO_CYCLE(ace) (					\
	(atomic_add_32_nv(&(ace)->ac_ref, -1) ==		\
	    CYCLEVAL) ?						\
		/*						\
		 * Ref count known to be 0 from above.		\
		 */						\
		B_TRUE :					\
		B_FALSE						\
)
1267 
1268 static void *
1269 list_get_head(list_t *list)
1270 {
1271 	list_node_t *lhead = list_head(list);
1272 
1273 	if (lhead != NULL)
1274 		list_remove(list, lhead);
1275 	return (lhead);
1276 }
1277 
1278 /*
1279  * This is always guaranteed to be able to queue the work.
1280  */
1281 static void
1282 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1283 {
1284 	/* Initialize request */
1285 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1286 	ptr->rq_op = op;
1287 
1288 	/*
1289 	 * Queue provided slot onto request pool.
1290 	 */
1291 	mutex_enter(&state->id_acache_req_lock);
1292 	list_insert_tail(&state->id_req_list, ptr);
1293 
1294 	/* Go, fetch, async thread */
1295 	cv_signal(&state->id_acache_req_cv);
1296 	mutex_exit(&state->id_acache_req_lock);
1297 }
1298 
1299 /*
1300  * Main body of the per interface async thread.
1301  */
1302 static void
1303 ibd_async_work(ibd_state_t *state)
1304 {
1305 	ibd_req_t *ptr;
1306 	callb_cpr_t cprinfo;
1307 
1308 	mutex_enter(&state->id_acache_req_lock);
1309 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1310 	    callb_generic_cpr, "ibd_async_work");
1311 	for (;;) {
1312 		ptr = list_get_head(&state->id_req_list);
1313 		if (ptr != NULL) {
1314 			mutex_exit(&state->id_acache_req_lock);
1315 
1316 			/*
1317 			 * Once we have done the operation, there is no
1318 			 * guarantee the request slot is going to be valid,
1319 			 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP).
1320 			 */
1321 
1322 			/* Perform the request */
1323 			switch (ptr->rq_op) {
1324 				case ASYNC_GETAH:
1325 					ibd_async_acache(state, &ptr->rq_mac);
1326 					break;
1327 				case ASYNC_REAP:
1328 					ibd_async_reap_group(state,
1329 					    ptr->rq_ptr, ptr->rq_gid,
1330 					    IB_MC_JSTATE_FULL);
1331 					/*
1332 					 * the req buf contains in mce
1333 					 * structure, so we do not need
1334 					 * to free it here.
1335 					 */
1336 					ptr = NULL;
1337 					break;
1338 				case ASYNC_LEAVE:
1339 				case ASYNC_JOIN:
1340 					ibd_async_multicast(state,
1341 					    ptr->rq_gid, ptr->rq_op);
1342 					break;
1343 				case ASYNC_PROMON:
1344 					ibd_async_setprom(state);
1345 					break;
1346 				case ASYNC_PROMOFF:
1347 					ibd_async_unsetprom(state);
1348 					break;
1349 				case ASYNC_TRAP:
1350 					ibd_async_trap(state, ptr);
1351 					break;
1352 				case ASYNC_SCHED:
1353 					ibd_async_txsched(state);
1354 					break;
1355 				case ASYNC_LINK:
1356 					ibd_async_link(state, ptr);
1357 					break;
1358 				case ASYNC_EXIT:
1359 					mutex_enter(&state->id_acache_req_lock);
1360 #ifndef	__lock_lint
1361 					CALLB_CPR_EXIT(&cprinfo);
1362 #endif /* !__lock_lint */
1363 					return;
1364 			}
1365 			if (ptr != NULL)
1366 				kmem_cache_free(state->id_req_kmc, ptr);
1367 
1368 			mutex_enter(&state->id_acache_req_lock);
1369 		} else {
1370 			/*
1371 			 * Nothing to do: wait till new request arrives.
1372 			 */
1373 #ifndef __lock_lint
1374 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1375 			cv_wait(&state->id_acache_req_cv,
1376 			    &state->id_acache_req_lock);
1377 			CALLB_CPR_SAFE_END(&cprinfo,
1378 			    &state->id_acache_req_lock);
1379 #endif /* !_lock_lint */
1380 		}
1381 	}
1382 	/*NOTREACHED*/
1383 	_NOTE(NOT_REACHED)
1384 }
1385 
1386 /*
1387  * Return when it is safe to queue requests to the async daemon; primarily
1388  * for subnet trap and async event handling. Disallow requests before the
1389  * daemon is created, and when interface deinitilization starts.
1390  */
1391 static boolean_t
1392 ibd_async_safe(ibd_state_t *state)
1393 {
1394 	mutex_enter(&state->id_trap_lock);
1395 	if (state->id_trap_stop) {
1396 		mutex_exit(&state->id_trap_lock);
1397 		return (B_FALSE);
1398 	}
1399 	state->id_trap_inprog++;
1400 	mutex_exit(&state->id_trap_lock);
1401 	return (B_TRUE);
1402 }
1403 
1404 /*
1405  * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
1406  * trap or event handling to complete to kill the async thread and deconstruct
1407  * the mcg/ace list.
1408  */
1409 static void
1410 ibd_async_done(ibd_state_t *state)
1411 {
1412 	mutex_enter(&state->id_trap_lock);
1413 	if (--state->id_trap_inprog == 0)
1414 		cv_signal(&state->id_trap_cv);
1415 	mutex_exit(&state->id_trap_lock);
1416 }
1417 
1418 /*
1419  * Hash functions:
1420  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1421  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1422  * These operate on mac addresses input into ibd_send, but there is no
1423  * guarantee on the alignment of the ipoib_mac_t structure.
1424  */
1425 /*ARGSUSED*/
1426 static uint_t
1427 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1428 {
1429 	ulong_t ptraddr = (ulong_t)key;
1430 	uint_t hval;
1431 
1432 	/*
1433 	 * If the input address is 4 byte aligned, we can just dereference
1434 	 * it. This is most common, since IP will send in a 4 byte aligned
1435 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1436 	 * 4 byte aligned too.
1437 	 */
1438 	if ((ptraddr & 3) == 0)
1439 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1440 
1441 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1442 	return (hval);
1443 }
1444 
1445 static int
1446 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1447 {
1448 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1449 		return (0);
1450 	else
1451 		return (1);
1452 }
1453 
1454 /*
1455  * Initialize all the per interface caches and lists; AH cache,
1456  * MCG list etc.
1457  */
1458 static int
1459 ibd_acache_init(ibd_state_t *state)
1460 {
1461 	ibd_ace_t *ce;
1462 	int i;
1463 
1464 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1465 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1466 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1467 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1468 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1469 	    offsetof(ibd_ace_t, ac_list));
1470 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1471 	    offsetof(ibd_ace_t, ac_list));
1472 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1473 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1474 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1475 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1476 	    offsetof(ibd_mce_t, mc_list));
1477 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1478 	    offsetof(ibd_mce_t, mc_list));
1479 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1480 	    offsetof(ibd_req_t, rq_list));
1481 
1482 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1483 	    IBD_NUM_AH, KM_SLEEP);
1484 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1485 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1486 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1487 			ibd_acache_fini(state);
1488 			return (DDI_FAILURE);
1489 		} else {
1490 			CLEAR_REFCYCLE(ce);
1491 			ce->ac_mce = NULL;
1492 			IBD_ACACHE_INSERT_FREE(state, ce);
1493 		}
1494 	}
1495 	return (DDI_SUCCESS);
1496 }
1497 
1498 static void
1499 ibd_acache_fini(ibd_state_t *state)
1500 {
1501 	ibd_ace_t *ptr;
1502 
1503 	mutex_enter(&state->id_ac_mutex);
1504 
1505 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1506 		ASSERT(GET_REF(ptr) == 0);
1507 		(void) ibt_free_ud_dest(ptr->ac_dest);
1508 	}
1509 
1510 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1511 		ASSERT(GET_REF(ptr) == 0);
1512 		(void) ibt_free_ud_dest(ptr->ac_dest);
1513 	}
1514 
1515 	list_destroy(&state->id_ah_free);
1516 	list_destroy(&state->id_ah_active);
1517 	list_destroy(&state->id_mc_full);
1518 	list_destroy(&state->id_mc_non);
1519 	list_destroy(&state->id_req_list);
1520 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1521 	mutex_exit(&state->id_ac_mutex);
1522 	mutex_destroy(&state->id_ac_mutex);
1523 	mutex_destroy(&state->id_mc_mutex);
1524 	mutex_destroy(&state->id_acache_req_lock);
1525 	cv_destroy(&state->id_acache_req_cv);
1526 }
1527 
1528 /*
1529  * Search AH active hash list for a cached path to input destination.
1530  * If we are "just looking", hold == F. When we are in the Tx path,
1531  * we set hold == T to grab a reference on the AH so that it can not
1532  * be recycled to a new destination while the Tx request is posted.
1533  */
1534 static ibd_ace_t *
1535 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1536 {
1537 	ibd_ace_t *ptr;
1538 
1539 	ASSERT(mutex_owned(&state->id_ac_mutex));
1540 
1541 	/*
1542 	 * Do hash search.
1543 	 */
1544 	if (mod_hash_find(state->id_ah_active_hash,
1545 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1546 		if (hold)
1547 			INC_REF(ptr, num);
1548 		return (ptr);
1549 	}
1550 	return (NULL);
1551 }
1552 
1553 /*
1554  * This is called by the tx side; if an initialized AH is found in
1555  * the active list, it is locked down and can be used; if no entry
1556  * is found, an async request is queued to do path resolution.
1557  */
1558 static ibd_ace_t *
1559 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1560 {
1561 	ibd_ace_t *ptr;
1562 	ibd_req_t *req;
1563 
1564 	/*
1565 	 * Only attempt to print when we can; in the mdt pattr case, the
1566 	 * address is not aligned properly.
1567 	 */
1568 	if (((ulong_t)mac & 3) == 0)
1569 		DPRINT(4,
1570 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1571 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1572 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1573 		    htonl(mac->ipoib_gidsuff[1]));
1574 
1575 	mutex_enter(&state->id_ac_mutex);
1576 
1577 	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
1578 		mutex_exit(&state->id_ac_mutex);
1579 		return (ptr);
1580 	}
1581 
1582 	/*
1583 	 * Implementation of a single outstanding async request; if
1584 	 * the operation is not started yet, queue a request and move
1585 	 * to ongoing state. Remember in id_ah_addr for which address
1586 	 * we are queueing the request, in case we need to flag an error;
1587 	 * Any further requests, for the same or different address, until
1588 	 * the operation completes, is sent back to GLDv3 to be retried.
1589 	 * The async thread will update id_ah_op with an error indication
1590 	 * or will set it to indicate the next look up can start; either
1591 	 * way, it will mac_tx_update() so that all blocked requests come
1592 	 * back here.
1593 	 */
1594 	*err = EAGAIN;
1595 	if (state->id_ah_op == NOTSTARTED) {
1596 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1597 		if (req != NULL) {
1598 			/*
1599 			 * We did not even find the entry; queue a request
1600 			 * for it.
1601 			 */
1602 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1603 			ibd_queue_work_slot(state, req, ASYNC_GETAH);
1604 			state->id_ah_op = ONGOING;
1605 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1606 		}
1607 	} else if ((state->id_ah_op != ONGOING) &&
1608 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1609 		/*
1610 		 * Check the status of the pathrecord lookup request
1611 		 * we had queued before.
1612 		 */
1613 		if (state->id_ah_op == ERRORED) {
1614 			*err = EFAULT;
1615 			state->id_ah_error++;
1616 		} else {
1617 			/*
1618 			 * ROUTERED case: We need to send to the
1619 			 * all-router MCG. If we can find the AH for
1620 			 * the mcg, the Tx will be attempted. If we
1621 			 * do not find the AH, we return NORESOURCES
1622 			 * to retry.
1623 			 */
1624 			ipoib_mac_t routermac;
1625 
1626 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1627 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1628 			    numwqe);
1629 		}
1630 		state->id_ah_op = NOTSTARTED;
1631 	} else if ((state->id_ah_op != ONGOING) &&
1632 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1633 		/*
1634 		 * This case can happen when we get a higher band
1635 		 * packet. The easiest way is to reset the state machine
1636 		 * to accommodate the higher priority packet.
1637 		 */
1638 		state->id_ah_op = NOTSTARTED;
1639 	}
1640 	mutex_exit(&state->id_ac_mutex);
1641 
1642 	return (ptr);
1643 }
1644 
1645 /*
1646  * Grab a not-currently-in-use AH/PathRecord from the active
1647  * list to recycle to a new destination. Only the async thread
1648  * executes this code.
1649  */
1650 static ibd_ace_t *
1651 ibd_acache_get_unref(ibd_state_t *state)
1652 {
1653 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1654 
1655 	ASSERT(mutex_owned(&state->id_ac_mutex));
1656 
1657 	/*
1658 	 * Do plain linear search.
1659 	 */
1660 	while (ptr != NULL) {
1661 		/*
1662 		 * Note that it is possible that the "cycle" bit
1663 		 * is set on the AH w/o any reference count. The
1664 		 * mcg must have been deleted, and the tx cleanup
1665 		 * just decremented the reference count to 0, but
1666 		 * hasn't gotten around to grabbing the id_ac_mutex
1667 		 * to move the AH into the free list.
1668 		 */
1669 		if (GET_REF(ptr) == 0) {
1670 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1671 			break;
1672 		}
1673 		ptr = list_next(&state->id_ah_active, ptr);
1674 	}
1675 	return (ptr);
1676 }
1677 
1678 /*
1679  * Invoked to clean up AH from active list in case of multicast
1680  * disable and to handle sendonly memberships during mcg traps.
1681  * And for port up processing for multicast and unicast AHs.
1682  * Normally, the AH is taken off the active list, and put into
1683  * the free list to be recycled for a new destination. In case
1684  * Tx requests on the AH have not completed yet, the AH is marked
1685  * for reaping (which will put the AH on the free list) once the Tx's
1686  * complete; in this case, depending on the "force" input, we take
1687  * out the AH from the active list right now, or leave it also for
1688  * the reap operation. Returns TRUE if the AH is taken off the active
1689  * list (and either put into the free list right now, or arranged for
1690  * later), FALSE otherwise.
1691  */
1692 static boolean_t
1693 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1694 {
1695 	ibd_ace_t *acactive;
1696 	boolean_t ret = B_TRUE;
1697 
1698 	ASSERT(mutex_owned(&state->id_ac_mutex));
1699 
1700 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1701 
1702 		/*
1703 		 * Note that the AH might already have the cycle bit set
1704 		 * on it; this might happen if sequences of multicast
1705 		 * enables and disables are coming so fast, that posted
1706 		 * Tx's to the mcg have not completed yet, and the cycle
1707 		 * bit is set successively by each multicast disable.
1708 		 */
1709 		if (SET_CYCLE_IF_REF(acactive)) {
1710 			if (!force) {
1711 				/*
1712 				 * The ace is kept on the active list, further
1713 				 * Tx's can still grab a reference on it; the
1714 				 * ace is reaped when all pending Tx's
1715 				 * referencing the AH complete.
1716 				 */
1717 				ret = B_FALSE;
1718 			} else {
1719 				/*
1720 				 * In the mcg trap case, we always pull the
1721 				 * AH from the active list. And also the port
1722 				 * up multi/unicast case.
1723 				 */
1724 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1725 				acactive->ac_mce = NULL;
1726 			}
1727 		} else {
1728 			/*
1729 			 * Determined the ref count is 0, thus reclaim
1730 			 * immediately after pulling out the ace from
1731 			 * the active list.
1732 			 */
1733 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1734 			acactive->ac_mce = NULL;
1735 			IBD_ACACHE_INSERT_FREE(state, acactive);
1736 		}
1737 
1738 	}
1739 	return (ret);
1740 }
1741 
1742 /*
1743  * Helper function for async path record lookup. If we are trying to
1744  * Tx to a MCG, check our membership, possibly trying to join the
1745  * group if required. If that fails, try to send the packet to the
1746  * all router group (indicated by the redirect output), pointing
1747  * the input mac address to the router mcg address.
1748  */
1749 static ibd_mce_t *
1750 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1751 {
1752 	ib_gid_t mgid;
1753 	ibd_mce_t *mce;
1754 	ipoib_mac_t routermac;
1755 
1756 	*redirect = B_FALSE;
1757 	ibd_n2h_gid(mac, &mgid);
1758 
1759 	/*
1760 	 * Check the FullMember+SendOnlyNonMember list.
1761 	 * Since we are the only one who manipulates the
1762 	 * id_mc_full list, no locks are needed.
1763 	 */
1764 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1765 	if (mce != NULL) {
1766 		DPRINT(4, "ibd_async_mcache : already joined to group");
1767 		return (mce);
1768 	}
1769 
1770 	/*
1771 	 * Not found; try to join(SendOnlyNonMember) and attach.
1772 	 */
1773 	DPRINT(4, "ibd_async_mcache : not joined to group");
1774 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1775 	    NULL) {
1776 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1777 		return (mce);
1778 	}
1779 
1780 	/*
1781 	 * MCGroup not present; try to join the all-router group. If
1782 	 * any of the following steps succeed, we will be redirecting
1783 	 * to the all router group.
1784 	 */
1785 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1786 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1787 		return (NULL);
1788 	*redirect = B_TRUE;
1789 	ibd_n2h_gid(&routermac, &mgid);
1790 	bcopy(&routermac, mac, IPOIB_ADDRL);
1791 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1792 	    mgid.gid_prefix, mgid.gid_guid);
1793 
1794 	/*
1795 	 * Are we already joined to the router group?
1796 	 */
1797 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1798 		DPRINT(4, "ibd_async_mcache : using already joined router"
1799 		    "group\n");
1800 		return (mce);
1801 	}
1802 
1803 	/*
1804 	 * Can we join(SendOnlyNonMember) the router group?
1805 	 */
1806 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1807 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1808 	    NULL) {
1809 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1810 		return (mce);
1811 	}
1812 
1813 	return (NULL);
1814 }
1815 
1816 /*
1817  * Async path record lookup code.
1818  */
1819 static void
1820 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1821 {
1822 	ibd_ace_t *ce;
1823 	ibd_mce_t *mce = NULL;
1824 	ibt_path_attr_t path_attr;
1825 	ibt_path_info_t path_info;
1826 	ib_gid_t destgid;
1827 	int ret = NOTSTARTED;
1828 
1829 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1830 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1831 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1832 	    htonl(mac->ipoib_gidsuff[1]));
1833 
1834 	/*
1835 	 * Check whether we are trying to transmit to a MCG.
1836 	 * In that case, we need to make sure we are a member of
1837 	 * the MCG.
1838 	 */
1839 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1840 		boolean_t redirected;
1841 
1842 		/*
1843 		 * If we can not find or join the group or even
1844 		 * redirect, error out.
1845 		 */
1846 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1847 		    NULL) {
1848 			state->id_ah_op = ERRORED;
1849 			return;
1850 		}
1851 
1852 		/*
1853 		 * If we got redirected, we need to determine whether
1854 		 * the AH for the new mcg is in the cache already, and
1855 		 * not pull it in then; otherwise proceed to get the
1856 		 * path for the new mcg. There is no guarantee that
1857 		 * if the AH is currently in the cache, it will still be
1858 		 * there when we look in ibd_acache_lookup(), but that's
1859 		 * okay, we will come back here.
1860 		 */
1861 		if (redirected) {
1862 			ret = ROUTERED;
1863 			DPRINT(4, "ibd_async_acache :  redirected to "
1864 			    "%08X:%08X:%08X:%08X:%08X",
1865 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1866 			    htonl(mac->ipoib_gidpref[1]),
1867 			    htonl(mac->ipoib_gidsuff[0]),
1868 			    htonl(mac->ipoib_gidsuff[1]));
1869 
1870 			mutex_enter(&state->id_ac_mutex);
1871 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1872 				mutex_exit(&state->id_ac_mutex);
1873 				DPRINT(4, "ibd_async_acache : router AH found");
1874 				state->id_ah_op = ROUTERED;
1875 				return;
1876 			}
1877 			mutex_exit(&state->id_ac_mutex);
1878 		}
1879 	}
1880 
1881 	/*
1882 	 * Get an AH from the free list.
1883 	 */
1884 	mutex_enter(&state->id_ac_mutex);
1885 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1886 		/*
1887 		 * No free ones; try to grab an unreferenced active
1888 		 * one. Maybe we need to make the active list LRU,
1889 		 * but that will create more work for Tx callbacks.
1890 		 * Is there a way of not having to pull out the
1891 		 * entry from the active list, but just indicate it
1892 		 * is being recycled? Yes, but that creates one more
1893 		 * check in the fast lookup path.
1894 		 */
1895 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1896 			/*
1897 			 * Pretty serious shortage now.
1898 			 */
1899 			state->id_ah_op = NOTSTARTED;
1900 			mutex_exit(&state->id_ac_mutex);
1901 			DPRINT(10, "ibd_async_acache : failed to find AH "
1902 			    "slot\n");
1903 			return;
1904 		}
1905 		/*
1906 		 * We could check whether ac_mce points to a SendOnly
1907 		 * member and drop that membership now. Or do it lazily
1908 		 * at detach time.
1909 		 */
1910 		ce->ac_mce = NULL;
1911 	}
1912 	mutex_exit(&state->id_ac_mutex);
1913 	ASSERT(ce->ac_mce == NULL);
1914 
1915 	/*
1916 	 * Update the entry.
1917 	 */
1918 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1919 
1920 	bzero(&path_info, sizeof (path_info));
1921 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1922 	path_attr.pa_sgid = state->id_sgid;
1923 	path_attr.pa_num_dgids = 1;
1924 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1925 	path_attr.pa_dgids = &destgid;
1926 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1927 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1928 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1929 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1930 		goto error;
1931 	}
1932 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1933 	    ntohl(ce->ac_mac.ipoib_qpn),
1934 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1935 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1936 		goto error;
1937 	}
1938 
1939 	/*
1940 	 * mce is set whenever an AH is being associated with a
1941 	 * MCG; this will come in handy when we leave the MCG. The
1942 	 * lock protects Tx fastpath from scanning the active list.
1943 	 */
1944 	if (mce != NULL)
1945 		ce->ac_mce = mce;
1946 	mutex_enter(&state->id_ac_mutex);
1947 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1948 	state->id_ah_op = ret;
1949 	mutex_exit(&state->id_ac_mutex);
1950 	return;
1951 error:
1952 	/*
1953 	 * We might want to drop SendOnly membership here if we
1954 	 * joined above. The lock protects Tx callbacks inserting
1955 	 * into the free list.
1956 	 */
1957 	mutex_enter(&state->id_ac_mutex);
1958 	state->id_ah_op = ERRORED;
1959 	IBD_ACACHE_INSERT_FREE(state, ce);
1960 	mutex_exit(&state->id_ac_mutex);
1961 }
1962 
1963 /*
1964  * While restoring port's presence on the subnet on a port up, it is possible
1965  * that the port goes down again.
1966  */
1967 static void
1968 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1969 {
1970 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1971 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1972 	    LINK_STATE_UP;
1973 	ibd_mce_t *mce, *pmce;
1974 	ibd_ace_t *ace, *pace;
1975 
1976 	DPRINT(10, "ibd_async_link(): %d", opcode);
1977 
1978 	/*
1979 	 * On a link up, revalidate the link speed/width. No point doing
1980 	 * this on a link down, since we will be unable to do SA operations,
1981 	 * defaulting to the lowest speed. Also notice that we update our
1982 	 * notion of speed before calling mac_link_update(), which will do
1983 	 * neccesary higher level notifications for speed changes.
1984 	 */
1985 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1986 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1987 		state->id_link_speed = ibd_get_portspeed(state);
1988 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1989 	}
1990 
1991 	/*
1992 	 * Do all the work required to establish our presence on
1993 	 * the subnet.
1994 	 */
1995 	if (opcode == IBD_LINK_UP_ABSENT) {
1996 		/*
1997 		 * If in promiscuous mode ...
1998 		 */
1999 		if (state->id_prom_op == COMPLETED) {
2000 			/*
2001 			 * Drop all nonmembership.
2002 			 */
2003 			ibd_async_unsetprom(state);
2004 
2005 			/*
2006 			 * Then, try to regain nonmembership to all mcg's.
2007 			 */
2008 			ibd_async_setprom(state);
2009 
2010 		}
2011 
2012 		/*
2013 		 * Drop all sendonly membership (which also gets rid of the
2014 		 * AHs); try to reacquire all full membership.
2015 		 */
2016 		mce = list_head(&state->id_mc_full);
2017 		while ((pmce = mce) != NULL) {
2018 			mce = list_next(&state->id_mc_full, mce);
2019 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
2020 				ibd_leave_group(state,
2021 				    pmce->mc_info.mc_adds_vect.av_dgid,
2022 				    IB_MC_JSTATE_SEND_ONLY_NON);
2023 			else
2024 				ibd_reacquire_group(state, pmce);
2025 		}
2026 
2027 		/*
2028 		 * Recycle all active AHs to free list (and if there are
2029 		 * pending posts, make sure they will go into the free list
2030 		 * once the Tx's complete). Grab the lock to prevent
2031 		 * concurrent Tx's as well as Tx cleanups.
2032 		 */
2033 		mutex_enter(&state->id_ac_mutex);
2034 		ace = list_head(&state->id_ah_active);
2035 		while ((pace = ace) != NULL) {
2036 			boolean_t cycled;
2037 
2038 			ace = list_next(&state->id_ah_active, ace);
2039 			mce = pace->ac_mce;
2040 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
2041 			    B_TRUE);
2042 			/*
2043 			 * If this is for an mcg, it must be for a fullmember,
2044 			 * since we got rid of send-only members above when
2045 			 * processing the mce list.
2046 			 */
2047 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2048 			    IB_MC_JSTATE_FULL)));
2049 
2050 			/*
2051 			 * Check if the fullmember mce needs to be torn down,
2052 			 * ie whether the DLPI disable has already been done.
2053 			 * If so, do some of the work of tx_cleanup, namely
2054 			 * causing leave (which will fail), detach and
2055 			 * mce-freeing. tx_cleanup will put the AH into free
2056 			 * list. The reason to duplicate some of this
2057 			 * tx_cleanup work is because we want to delete the
2058 			 * AH right now instead of waiting for tx_cleanup, to
2059 			 * force subsequent Tx's to reacquire an AH.
2060 			 */
2061 			if ((mce != NULL) && (mce->mc_fullreap))
2062 				ibd_async_reap_group(state, mce,
2063 				    mce->mc_info.mc_adds_vect.av_dgid,
2064 				    mce->mc_jstate);
2065 		}
2066 		mutex_exit(&state->id_ac_mutex);
2067 	}
2068 
2069 	/*
2070 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2071 	 * (which stops further events from being delivered) before
2072 	 * mac_unreigster(). At this point, it is guaranteed that mac_register
2073 	 * has already been done.
2074 	 */
2075 	mutex_enter(&state->id_link_mutex);
2076 	state->id_link_state = lstate;
2077 	mac_link_update(state->id_mh, lstate);
2078 	mutex_exit(&state->id_link_mutex);
2079 
2080 	ibd_async_done(state);
2081 }
2082 
2083 /*
2084  * When the link is notified up, we need to do a few things, based
2085  * on the port's current p_init_type_reply claiming a reinit has been
2086  * done or not. The reinit steps are:
2087  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2088  *    the old Pkey and GID0 are correct.
2089  * 2. Register for mcg traps (already done by ibmf).
2090  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2091  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2092  * 4. Give up all sendonly memberships.
2093  * 5. Acquire all full memberships.
2094  * 6. In promiscuous mode, acquire all non memberships.
2095  * 7. Recycle all AHs to free list.
2096  */
2097 static void
2098 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2099 {
2100 	ibt_hca_portinfo_t *port_infop;
2101 	ibt_status_t ibt_status;
2102 	uint_t psize, port_infosz;
2103 	ibd_link_op_t opcode;
2104 	ibd_req_t *req;
2105 
2106 	/*
2107 	 * Do not send a request to the async daemon if it has not
2108 	 * yet been created or is being destroyed. If the async
2109 	 * daemon has not yet been created, we still need to track
2110 	 * last known state of the link. If this code races with the
2111 	 * detach path, then we are assured that the detach path has
2112 	 * not yet done the ibt_close_hca (which waits for all async
2113 	 * events to complete). If the code races with the attach path,
2114 	 * we need to validate the pkey/gid (in the link_up case) if
2115 	 * the initialization path has already set these up and created
2116 	 * IBTF resources based on the values.
2117 	 */
2118 	mutex_enter(&state->id_link_mutex);
2119 
2120 	/*
2121 	 * If the init code in ibd_drv_init hasn't yet set up the
2122 	 * pkey/gid, nothing to do; that code will set the link state.
2123 	 */
2124 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2125 		mutex_exit(&state->id_link_mutex);
2126 		return;
2127 	}
2128 
2129 	if (code == IBT_EVENT_PORT_UP) {
2130 		uint8_t itreply;
2131 		boolean_t badup = B_FALSE;
2132 
2133 		ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
2134 		    state->id_port, &port_infop, &psize, &port_infosz);
2135 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
2136 			mutex_exit(&state->id_link_mutex);
2137 			DPRINT(10, "ibd_link_up : failed in"
2138 			    " ibt_query_port()\n");
2139 			return;
2140 		}
2141 
2142 		/*
2143 		 * If the link already went down by the time the handler gets
2144 		 * here, give up; we can not even validate pkey/gid since those
2145 		 * are not valid.
2146 		 */
2147 		if (port_infop->p_linkstate != IBT_PORT_ACTIVE)
2148 			badup = B_TRUE;
2149 
2150 		itreply = port_infop->p_init_type_reply;
2151 
2152 		/*
2153 		 * In InitTypeReply, check if NoLoadReply ==
2154 		 * PreserveContentReply == 0, in which case, verify Pkey/GID0.
2155 		 */
2156 		if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2157 		    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
2158 		    (!badup)) {
2159 			/*
2160 			 * Check that the subnet part of GID0 has not changed.
2161 			 */
2162 			if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid,
2163 			    sizeof (ib_gid_t)) != 0)
2164 				badup = B_TRUE;
2165 
2166 			/*
2167 			 * Check that Pkey/index mapping is still valid.
2168 			 */
2169 			if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) ||
2170 			    (port_infop->p_pkey_tbl[state->id_pkix] !=
2171 			    state->id_pkey))
2172 				badup = B_TRUE;
2173 		}
2174 
2175 		/*
2176 		 * In InitTypeReply, if PreservePresenceReply indicates the SM
2177 		 * has ensured that the port's presence in mcg, traps etc is
2178 		 * intact, nothing more to do.
2179 		 */
2180 		opcode = IBD_LINK_UP_ABSENT;
2181 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2182 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
2183 			opcode = IBD_LINK_UP;
2184 
2185 		if (badup)
2186 			code = IBT_ERROR_PORT_DOWN;
2187 		ibt_free_portinfo(port_infop, port_infosz);
2188 	}
2189 
2190 	if (!ibd_async_safe(state)) {
2191 		state->id_link_state = ((code == IBT_EVENT_PORT_UP) ?
2192 		    LINK_STATE_UP : LINK_STATE_DOWN);
2193 		mutex_exit(&state->id_link_mutex);
2194 		return;
2195 	}
2196 	mutex_exit(&state->id_link_mutex);
2197 
2198 	if (code == IBT_ERROR_PORT_DOWN)
2199 		opcode = IBD_LINK_DOWN;
2200 
2201 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2202 	req->rq_ptr = (void *)opcode;
2203 	ibd_queue_work_slot(state, req, ASYNC_LINK);
2204 }
2205 
2206 /*
2207  * For the port up/down events, IBTL guarantees there will not be concurrent
2208  * invocations of the handler. IBTL might coalesce link transition events,
2209  * and not invoke the handler for _each_ up/down transition, but it will
2210  * invoke the handler with last known state
2211  */
2212 static void
2213 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2214     ibt_async_code_t code, ibt_async_event_t *event)
2215 {
2216 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2217 
2218 	switch (code) {
2219 	case IBT_ERROR_CATASTROPHIC_CHAN:
2220 		ibd_print_warn(state, "catastrophic channel error");
2221 		break;
2222 	case IBT_ERROR_CQ:
2223 		ibd_print_warn(state, "completion queue error");
2224 		break;
2225 	case IBT_ERROR_PORT_DOWN:
2226 	case IBT_EVENT_PORT_UP:
2227 		/*
2228 		 * Events will be delivered to all instances that have
2229 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2230 		 * Only need to do work for our port; IBTF will deliver
2231 		 * events for other ports on the hca we have ibt_open_hca'ed
2232 		 * too. Note that ibd_drv_init() initializes id_port before
2233 		 * doing ibt_open_hca().
2234 		 */
2235 		ASSERT(state->id_hca_hdl == hca_hdl);
2236 		if (state->id_port != event->ev_port)
2237 			break;
2238 
2239 		ibd_link_mod(state, code);
2240 		break;
2241 
2242 	case IBT_HCA_ATTACH_EVENT:
2243 	case IBT_HCA_DETACH_EVENT:
2244 		/*
2245 		 * When a new card is plugged to the system, attach_event is
2246 		 * invoked. Additionally, a cfgadm needs to be run to make the
2247 		 * card known to the system, and an ifconfig needs to be run to
2248 		 * plumb up any ibd interfaces on the card. In the case of card
2249 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2250 		 * unplumb the ibd interfaces on the card; when the card is
2251 		 * actually unplugged, the detach_event is invoked;
2252 		 * additionally, if any ibd instances are still active on the
2253 		 * card (eg there were no associated RCM scripts), driver's
2254 		 * detach routine is invoked.
2255 		 */
2256 		break;
2257 	default:
2258 		break;
2259 	}
2260 }
2261 
2262 /*
2263  * Attach device to the IO framework.
2264  */
2265 static int
2266 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2267 {
2268 	mac_register_t *macp;
2269 	ibd_state_t *state;
2270 	int instance;
2271 	int err;
2272 
2273 	switch (cmd) {
2274 		case DDI_ATTACH:
2275 			break;
2276 		case DDI_RESUME:
2277 			/* This driver does not support resume */
2278 		default:
2279 			return (DDI_FAILURE);
2280 	}
2281 
2282 	/*
2283 	 * Allocate soft device data structure
2284 	 */
2285 	instance = ddi_get_instance(dip);
2286 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2287 		return (DDI_FAILURE);
2288 	state = ddi_get_soft_state(ibd_list, instance);
2289 
2290 	/* pre ibt_attach() soft state initialization */
2291 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2292 		DPRINT(10, "ibd_attach : failed in ibd_state_init()");
2293 		goto attach_fail_state_init;
2294 	}
2295 
2296 	/* alloc rx soft intr */
2297 	if ((ibd_rx_softintr == 1) &&
2298 	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2299 	    NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) {
2300 		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
2301 		goto attach_fail_ddi_add_rx_softintr;
2302 	}
2303 
2304 	/* alloc tx soft intr */
2305 	if ((ibd_tx_softintr == 1) &&
2306 	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2307 	    NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) {
2308 		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
2309 		goto attach_fail_ddi_add_tx_softintr;
2310 	}
2311 
2312 	/* "attach" to IBTL */
2313 	if (ibt_attach(&ibd_clnt_modinfo, dip, state,
2314 	    &state->id_ibt_hdl) != IBT_SUCCESS) {
2315 		DPRINT(10, "ibd_attach : failed in ibt_attach()");
2316 		goto attach_fail_ibt_attach;
2317 	}
2318 
2319 	/* Finish initializing this driver */
2320 	if (ibd_drv_init(state) != DDI_SUCCESS) {
2321 		DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n");
2322 		goto attach_fail_drv_init;
2323 	}
2324 
2325 	/*
2326 	 * Initialize pointers to device specific functions which will be
2327 	 * used by the generic layer.
2328 	 */
2329 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2330 		DPRINT(10, "ibd_attach : failed in mac_alloc()");
2331 		goto attach_fail_drv_init;
2332 	}
2333 
2334 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2335 	macp->m_driver = state;
2336 	macp->m_dip = state->id_dip;
2337 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2338 	macp->m_callbacks = &ib_m_callbacks;
2339 	macp->m_min_sdu = 0;
2340 	macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE;
2341 
2342 	/*
2343 	 *  Register ourselves with the GLDv3 interface
2344 	 */
2345 	err = mac_register(macp, &state->id_mh);
2346 	mac_free(macp);
2347 	if (err != 0) {
2348 		DPRINT(10, "ibd_attach : failed in mac_register()");
2349 		goto attach_fail_mac_register;
2350 	}
2351 
2352 	/*
2353 	 * Setup the handler we will use for regular DLPI stuff. Its important
2354 	 * to setup the recv handler after registering with gldv3.
2355 	 */
2356 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2357 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
2358 	    IBT_SUCCESS) {
2359 		DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n");
2360 		goto attach_fail_setup_handler;
2361 	}
2362 
2363 	/*
2364 	 * Setup the subnet notices handler after we initialize the a/mcaches
2365 	 * and start the async thread, both of which are required for the
2366 	 * trap handler to function properly. Enable the trap handler to
2367 	 * queue requests to the async thread after the mac_register, because
2368 	 * the async daemon invokes mac_tx_update(), which must be done after
2369 	 * mac_register().
2370 	 */
2371 	ibt_register_subnet_notices(state->id_ibt_hdl,
2372 	    ibd_snet_notices_handler, state);
2373 	mutex_enter(&state->id_trap_lock);
2374 	state->id_trap_stop = B_FALSE;
2375 	mutex_exit(&state->id_trap_lock);
2376 
2377 	/*
2378 	 * Indicate link status to GLDv3 and higher layers. By default,
2379 	 * we assume we are in up state (which must have been true at
2380 	 * least at the time the broadcast mcg's were probed); if there
2381 	 * were any up/down transitions till the time we come here, the
2382 	 * async handler will have updated last known state, which we
2383 	 * use to tell GLDv3. The async handler will not send any
2384 	 * notifications to GLDv3 till we reach here in the initialization
2385 	 * sequence.
2386 	 */
2387 	mac_link_update(state->id_mh, state->id_link_state);
2388 
2389 	return (DDI_SUCCESS);
2390 
2391 	/* Attach failure points, cleanup */
2392 attach_fail_setup_handler:
2393 	(void) mac_unregister(state->id_mh);
2394 
2395 attach_fail_mac_register:
2396 	ibd_drv_fini(state);
2397 
2398 attach_fail_drv_init:
2399 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2400 		ibd_print_warn(state, "failed to free IB resources");
2401 
2402 attach_fail_ibt_attach:
2403 	if (ibd_tx_softintr == 1)
2404 		ddi_remove_softintr(state->id_tx);
2405 
2406 attach_fail_ddi_add_tx_softintr:
2407 	if (ibd_rx_softintr == 1)
2408 		ddi_remove_softintr(state->id_rx);
2409 
2410 attach_fail_ddi_add_rx_softintr:
2411 	ibd_state_fini(state);
2412 
2413 attach_fail_state_init:
2414 	ddi_soft_state_free(ibd_list, instance);
2415 
2416 	return (DDI_FAILURE);
2417 }
2418 
2419 /*
2420  * Detach device from the IO framework.
2421  */
2422 static int
2423 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2424 {
2425 	ibd_state_t *state;
2426 	int status;
2427 	int instance;
2428 
2429 	switch (cmd) {
2430 		case DDI_DETACH:
2431 			break;
2432 		case DDI_SUSPEND:
2433 		default:
2434 			return (DDI_FAILURE);
2435 	}
2436 
2437 	instance = ddi_get_instance(dip);
2438 	state = ddi_get_soft_state(ibd_list, instance);
2439 
2440 	/*
2441 	 * First, stop receive interrupts; this stops the
2442 	 * driver from handing up buffers to higher layers.
2443 	 * Wait for receive buffers to be returned; give up
2444 	 * after 5 seconds.
2445 	 */
2446 	ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
2447 	status = 50;
2448 	while (state->id_rx_list.dl_bufs_outstanding > 0) {
2449 		delay(drv_usectohz(100000));
2450 		if (--status == 0) {
2451 			DPRINT(2, "ibd_detach : reclaiming failed");
2452 			goto failed;
2453 		}
2454 	}
2455 
2456 	if (mac_unregister(state->id_mh) != DDI_SUCCESS) {
2457 		DPRINT(10, "ibd_detach : failed in mac_unregister()");
2458 		goto failed;
2459 	}
2460 
2461 	if (ibd_rx_softintr == 1)
2462 		ddi_remove_softintr(state->id_rx);
2463 
2464 	if (ibd_tx_softintr == 1)
2465 		ddi_remove_softintr(state->id_tx);
2466 
2467 	ibd_drv_fini(state);
2468 
2469 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2470 		ibd_print_warn(state, "failed to free all IB resources at "
2471 		    "driver detach time");
2472 
2473 	ibd_state_fini(state);
2474 	ddi_soft_state_free(ibd_list, instance);
2475 	return (DDI_SUCCESS);
2476 
2477 failed:
2478 	/*
2479 	 * Reap all the Tx/Rx completions that were posted since we
2480 	 * turned off the notification. Turn on notifications. There
2481 	 * is a race in that we do not reap completions that come in
2482 	 * after the poll and before notifications get turned on. That
2483 	 * is okay, the next rx/tx packet will trigger a completion
2484 	 * that will reap any missed completions.
2485 	 */
2486 	ibd_poll_compq(state, state->id_rcq_hdl);
2487 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2488 	return (DDI_FAILURE);
2489 }
2490 
2491 /*
2492  * Pre ibt_attach() driver initialization
2493  */
2494 static int
2495 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2496 {
2497 	char buf[64];
2498 
2499 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2500 	state->id_link_state = LINK_STATE_UNKNOWN;
2501 
2502 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2503 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2504 	state->id_trap_stop = B_TRUE;
2505 	state->id_trap_inprog = 0;
2506 
2507 	mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL);
2508 	state->id_dip = dip;
2509 
2510 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2511 
2512 	state->id_tx_list.dl_head = NULL;
2513 	state->id_tx_list.dl_tail = NULL;
2514 	state->id_tx_list.dl_pending_sends = B_FALSE;
2515 	state->id_tx_list.dl_cnt = 0;
2516 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2517 
2518 	state->id_rx_list.dl_head = NULL;
2519 	state->id_rx_list.dl_tail = NULL;
2520 	state->id_rx_list.dl_bufs_outstanding = 0;
2521 	state->id_rx_list.dl_cnt = 0;
2522 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2523 	mutex_init(&state->id_rx_mutex, NULL, MUTEX_DRIVER, NULL);
2524 
2525 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2526 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2527 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2528 
2529 	return (DDI_SUCCESS);
2530 }
2531 
2532 /*
2533  * Post ibt_detach() driver deconstruction
2534  */
2535 static void
2536 ibd_state_fini(ibd_state_t *state)
2537 {
2538 	mutex_destroy(&state->id_tx_list.dl_mutex);
2539 	mutex_destroy(&state->id_rx_list.dl_mutex);
2540 	mutex_destroy(&state->id_rx_mutex);
2541 	mutex_destroy(&state->id_sched_lock);
2542 	mutex_destroy(&state->id_txcomp_lock);
2543 
2544 	cv_destroy(&state->id_trap_cv);
2545 	mutex_destroy(&state->id_trap_lock);
2546 	mutex_destroy(&state->id_link_mutex);
2547 	kmem_cache_destroy(state->id_req_kmc);
2548 }
2549 
2550 /*
2551  * Fetch IBA parameters for the network device from IB nexus.
2552  */
2553 static int
2554 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid)
2555 {
2556 	/*
2557 	 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
2558 	 * Note that the default partition is also allowed.
2559 	 */
2560 	state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2561 	    0, "port-pkey", IB_PKEY_INVALID_LIMITED);
2562 	if (state->id_pkey <= IB_PKEY_INVALID_FULL) {
2563 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong"
2564 		    "partition\n");
2565 		return (DDI_FAILURE);
2566 	}
2567 
2568 	/*
2569 	 * ... the IBA port ...
2570 	 */
2571 	state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2572 	    0, "port-number", 0);
2573 	if (state->id_port == 0) {
2574 		DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n");
2575 		return (DDI_FAILURE);
2576 	}
2577 
2578 	/*
2579 	 * ... and HCA GUID.
2580 	 */
2581 	*hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
2582 	    0, "hca-guid", 0);
2583 	if (*hca_guid == 0) {
2584 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong "
2585 		    "guid\n");
2586 		return (DDI_FAILURE);
2587 	}
2588 
2589 	return (DDI_SUCCESS);
2590 }
2591 
2592 /*
2593  * Fetch link speed from SA for snmp ifspeed reporting.
2594  */
2595 static uint64_t
2596 ibd_get_portspeed(ibd_state_t *state)
2597 {
2598 	int			ret;
2599 	ibt_path_info_t		path;
2600 	ibt_path_attr_t		path_attr;
2601 	uint8_t			num_paths;
2602 	uint64_t		ifspeed;
2603 
2604 	/*
2605 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2606 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2607 	 * 2000000000. Start with that as default.
2608 	 */
2609 	ifspeed = 2000000000;
2610 
2611 	bzero(&path_attr, sizeof (path_attr));
2612 
2613 	/*
2614 	 * Get the port speed from Loopback path information.
2615 	 */
2616 	path_attr.pa_dgids = &state->id_sgid;
2617 	path_attr.pa_num_dgids = 1;
2618 	path_attr.pa_sgid = state->id_sgid;
2619 
2620 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2621 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2622 		goto earlydone;
2623 
2624 	if (num_paths < 1)
2625 		goto earlydone;
2626 
2627 	/*
2628 	 * In case SA does not return an expected value, report the default
2629 	 * speed as 1X.
2630 	 */
2631 	ret = 1;
2632 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2633 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2634 			ret = 1;
2635 			break;
2636 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2637 			ret = 4;
2638 			break;
2639 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2640 			ret = 12;
2641 			break;
2642 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2643 			ret = 2;
2644 			break;
2645 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2646 			ret = 8;
2647 			break;
2648 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2649 			ret = 16;
2650 			break;
2651 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2652 			ret = 24;
2653 			break;
2654 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2655 			ret = 32;
2656 			break;
2657 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2658 			ret = 48;
2659 			break;
2660 	}
2661 
2662 	ifspeed *= ret;
2663 
2664 earlydone:
2665 	return (ifspeed);
2666 }
2667 
2668 /*
2669  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2670  * representing the input mcg mgid.
2671  */
2672 static ibd_mce_t *
2673 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2674 {
2675 	ibd_mce_t *ptr = list_head(mlist);
2676 
2677 	/*
2678 	 * Do plain linear search.
2679 	 */
2680 	while (ptr != NULL) {
2681 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2682 		    sizeof (ib_gid_t)) == 0)
2683 			return (ptr);
2684 		ptr = list_next(mlist, ptr);
2685 	}
2686 	return (NULL);
2687 }
2688 
2689 /*
2690  * Execute IBA JOIN.
2691  */
2692 static ibt_status_t
2693 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2694 {
2695 	ibt_mcg_attr_t mcg_attr;
2696 
2697 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2698 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2699 	mcg_attr.mc_mgid = mgid;
2700 	mcg_attr.mc_join_state = mce->mc_jstate;
2701 	mcg_attr.mc_scope = state->id_scope;
2702 	mcg_attr.mc_pkey = state->id_pkey;
2703 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2704 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2705 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2706 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2707 	    NULL, NULL));
2708 }
2709 
2710 /*
2711  * This code JOINs the port in the proper way (depending on the join
2712  * state) so that IBA fabric will forward mcg packets to/from the port.
2713  * It also attaches the QPN to the mcg so it can receive those mcg
2714  * packets. This code makes sure not to attach the mcg to the QP if
2715  * that has been previously done due to the mcg being joined with a
2716  * different join state, even though this is not required by SWG_0216,
2717  * refid 3610.
2718  */
2719 static ibd_mce_t *
2720 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2721 {
2722 	ibt_status_t ibt_status;
2723 	ibd_mce_t *mce, *tmce, *omce = NULL;
2724 	boolean_t do_attach = B_TRUE;
2725 
2726 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2727 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2728 
2729 	/*
2730 	 * For enable_multicast Full member joins, we need to do some
2731 	 * extra work. If there is already an mce on the list that
2732 	 * indicates full membership, that means the membership has
2733 	 * not yet been dropped (since the disable_multicast was issued)
2734 	 * because there are pending Tx's to the mcg; in that case, just
2735 	 * mark the mce not to be reaped when the Tx completion queues
2736 	 * an async reap operation.
2737 	 *
2738 	 * If there is already an mce on the list indicating sendonly
2739 	 * membership, try to promote to full membership. Be careful
2740 	 * not to deallocate the old mce, since there might be an AH
2741 	 * pointing to it; instead, update the old mce with new data
2742 	 * that tracks the full membership.
2743 	 */
2744 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2745 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2746 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2747 			ASSERT(omce->mc_fullreap);
2748 			omce->mc_fullreap = B_FALSE;
2749 			return (omce);
2750 		} else {
2751 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2752 		}
2753 	}
2754 
2755 	/*
2756 	 * Allocate the ibd_mce_t to track this JOIN.
2757 	 */
2758 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2759 	mce->mc_fullreap = B_FALSE;
2760 	mce->mc_jstate = jstate;
2761 
2762 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2763 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2764 		    ibt_status);
2765 		kmem_free(mce, sizeof (ibd_mce_t));
2766 		return (NULL);
2767 	}
2768 
2769 	/*
2770 	 * Is an IBA attach required? Not if the interface is already joined
2771 	 * to the mcg in a different appropriate join state.
2772 	 */
2773 	if (jstate == IB_MC_JSTATE_NON) {
2774 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2775 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2776 			do_attach = B_FALSE;
2777 	} else if (jstate == IB_MC_JSTATE_FULL) {
2778 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2779 			do_attach = B_FALSE;
2780 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2781 		do_attach = B_FALSE;
2782 	}
2783 
2784 	if (do_attach) {
2785 		/*
2786 		 * Do the IBA attach.
2787 		 */
2788 		DPRINT(10, "ibd_join_group : ibt_attach_mcg \n");
2789 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2790 		    &mce->mc_info)) != IBT_SUCCESS) {
2791 			DPRINT(10, "ibd_join_group : failed qp attachment "
2792 			    "%d\n", ibt_status);
2793 			/*
2794 			 * NOTE that we should probably preserve the join info
2795 			 * in the list and later try to leave again at detach
2796 			 * time.
2797 			 */
2798 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2799 			    state->id_sgid, jstate);
2800 			kmem_free(mce, sizeof (ibd_mce_t));
2801 			return (NULL);
2802 		}
2803 	}
2804 
2805 	/*
2806 	 * Insert the ibd_mce_t in the proper list.
2807 	 */
2808 	if (jstate == IB_MC_JSTATE_NON) {
2809 		IBD_MCACHE_INSERT_NON(state, mce);
2810 	} else {
2811 		/*
2812 		 * Set up the mc_req fields used for reaping the
2813 		 * mcg in case of delayed tx completion (see
2814 		 * ibd_tx_cleanup()). Also done for sendonly join in
2815 		 * case we are promoted to fullmembership later and
2816 		 * keep using the same mce.
2817 		 */
2818 		mce->mc_req.rq_gid = mgid;
2819 		mce->mc_req.rq_ptr = mce;
2820 		/*
2821 		 * Check whether this is the case of trying to join
2822 		 * full member, and we were already joined send only.
2823 		 * We try to drop our SendOnly membership, but it is
2824 		 * possible that the mcg does not exist anymore (and
2825 		 * the subnet trap never reached us), so the leave
2826 		 * operation might fail.
2827 		 */
2828 		if (omce != NULL) {
2829 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2830 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2831 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2832 			bcopy(&mce->mc_info, &omce->mc_info,
2833 			    sizeof (ibt_mcg_info_t));
2834 			kmem_free(mce, sizeof (ibd_mce_t));
2835 			return (omce);
2836 		}
2837 		mutex_enter(&state->id_mc_mutex);
2838 		IBD_MCACHE_INSERT_FULL(state, mce);
2839 		mutex_exit(&state->id_mc_mutex);
2840 	}
2841 
2842 	return (mce);
2843 }
2844 
2845 /*
2846  * Called during port up event handling to attempt to reacquire full
2847  * membership to an mcg. Stripped down version of ibd_join_group().
2848  * Note that it is possible that the mcg might have gone away, and
2849  * gets recreated at this point.
2850  */
2851 static void
2852 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2853 {
2854 	ib_gid_t mgid;
2855 
2856 	/*
2857 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2858 	 * reap/leave is going to try to leave the group. We could prevent
2859 	 * that by adding a boolean flag into ibd_mce_t, if required.
2860 	 */
2861 	if (mce->mc_fullreap)
2862 		return;
2863 
2864 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2865 
2866 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2867 	    mgid.gid_guid);
2868 
2869 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2870 		ibd_print_warn(state, "Failure on port up to rejoin "
2871 		    "multicast gid %016llx:%016llx",
2872 		    (u_longlong_t)mgid.gid_prefix,
2873 		    (u_longlong_t)mgid.gid_guid);
2874 }
2875 
2876 /*
2877  * This code handles delayed Tx completion cleanups for mcg's to which
2878  * disable_multicast has been issued, regular mcg related cleanups during
2879  * disable_multicast, disable_promiscous and mcg traps, as well as
2880  * cleanups during driver detach time. Depending on the join state,
2881  * it deletes the mce from the appropriate list and issues the IBA
2882  * leave/detach; except in the disable_multicast case when the mce
2883  * is left on the active list for a subsequent Tx completion cleanup.
2884  */
2885 static void
2886 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2887     uint8_t jstate)
2888 {
2889 	ibd_mce_t *tmce;
2890 	boolean_t do_detach = B_TRUE;
2891 
2892 	/*
2893 	 * Before detaching, we must check whether the other list
2894 	 * contains the mcg; if we detach blindly, the consumer
2895 	 * who set up the other list will also stop receiving
2896 	 * traffic.
2897 	 */
2898 	if (jstate == IB_MC_JSTATE_FULL) {
2899 		/*
2900 		 * The following check is only relevant while coming
2901 		 * from the Tx completion path in the reap case.
2902 		 */
2903 		if (!mce->mc_fullreap)
2904 			return;
2905 		mutex_enter(&state->id_mc_mutex);
2906 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2907 		mutex_exit(&state->id_mc_mutex);
2908 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2909 			do_detach = B_FALSE;
2910 	} else if (jstate == IB_MC_JSTATE_NON) {
2911 		IBD_MCACHE_PULLOUT_NON(state, mce);
2912 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2913 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2914 			do_detach = B_FALSE;
2915 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2916 		mutex_enter(&state->id_mc_mutex);
2917 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2918 		mutex_exit(&state->id_mc_mutex);
2919 		do_detach = B_FALSE;
2920 	}
2921 
2922 	/*
2923 	 * If we are reacting to a mcg trap and leaving our sendonly or
2924 	 * non membership, the mcg is possibly already gone, so attempting
2925 	 * to leave might fail. On the other hand, we must try to leave
2926 	 * anyway, since this might be a trap from long ago, and we could
2927 	 * have potentially sendonly joined to a recent incarnation of
2928 	 * the mcg and are about to loose track of this information.
2929 	 */
2930 	if (do_detach) {
2931 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2932 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2933 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
2934 	}
2935 
2936 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2937 	kmem_free(mce, sizeof (ibd_mce_t));
2938 }
2939 
2940 /*
2941  * Async code executed due to multicast and promiscuous disable requests
2942  * and mcg trap handling; also executed during driver detach. Mostly, a
2943  * leave and detach is done; except for the fullmember case when Tx
2944  * requests are pending, whence arrangements are made for subsequent
2945  * cleanup on Tx completion.
2946  */
2947 static void
2948 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2949 {
2950 	ipoib_mac_t mcmac;
2951 	boolean_t recycled;
2952 	ibd_mce_t *mce;
2953 
2954 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
2955 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2956 
2957 	if (jstate == IB_MC_JSTATE_NON) {
2958 		recycled = B_TRUE;
2959 		mce = IBD_MCACHE_FIND_NON(state, mgid);
2960 		/*
2961 		 * In case we are handling a mcg trap, we might not find
2962 		 * the mcg in the non list.
2963 		 */
2964 		if (mce == NULL)
2965 			return;
2966 	} else {
2967 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
2968 
2969 		/*
2970 		 * In case we are handling a mcg trap, make sure the trap
2971 		 * is not arriving late; if we have an mce that indicates
2972 		 * that we are already a fullmember, that would be a clear
2973 		 * indication that the trap arrived late (ie, is for a
2974 		 * previous incarnation of the mcg).
2975 		 */
2976 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
2977 			if ((mce == NULL) || (mce->mc_jstate ==
2978 			    IB_MC_JSTATE_FULL))
2979 				return;
2980 			ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2981 		} else {
2982 			ASSERT(jstate == IB_MC_JSTATE_FULL);
2983 			ASSERT(mce->mc_jstate == IB_MC_JSTATE_FULL);
2984 
2985 			/*
2986 			 * If join group failed, mce will be NULL here.
2987 			 * This is because in GLDv3 driver, set multicast
2988 			 *  will always return success.
2989 			 */
2990 			if (mce == NULL)
2991 				return;
2992 			mce->mc_fullreap = B_TRUE;
2993 		}
2994 
2995 		/*
2996 		 * If no pending Tx's remain that reference the AH
2997 		 * for the mcg, recycle it from active to free list.
2998 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
2999 		 * so the last completing Tx will cause an async reap
3000 		 * operation to be invoked, at which time we will drop our
3001 		 * membership to the mcg so that the pending Tx's complete
3002 		 * successfully. Refer to comments on "AH and MCE active
3003 		 * list manipulation" at top of this file. The lock protects
3004 		 * against Tx fast path and Tx cleanup code.
3005 		 */
3006 		mutex_enter(&state->id_ac_mutex);
3007 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3008 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3009 		    IB_MC_JSTATE_SEND_ONLY_NON));
3010 		mutex_exit(&state->id_ac_mutex);
3011 	}
3012 
3013 	if (recycled) {
3014 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3015 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3016 		ibd_async_reap_group(state, mce, mgid, jstate);
3017 	}
3018 }
3019 
3020 /*
3021  * Find the broadcast address as defined by IPoIB; implicitly
3022  * determines the IBA scope, mtu, tclass etc of the link the
3023  * interface is going to be a member of.
3024  */
3025 static ibt_status_t
3026 ibd_find_bgroup(ibd_state_t *state)
3027 {
3028 	ibt_mcg_attr_t mcg_attr;
3029 	uint_t numg;
3030 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3031 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3032 	    IB_MC_SCOPE_GLOBAL };
3033 	int i, mcgmtu;
3034 	boolean_t found = B_FALSE;
3035 
3036 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3037 	mcg_attr.mc_pkey = state->id_pkey;
3038 	state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK;
3039 
3040 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3041 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3042 
3043 		/*
3044 		 * Look for the IPoIB broadcast group.
3045 		 */
3046 		state->id_mgid.gid_prefix =
3047 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3048 		    ((uint64_t)state->id_scope << 48) |
3049 		    ((uint32_t)(state->id_pkey << 16)));
3050 		mcg_attr.mc_mgid = state->id_mgid;
3051 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3052 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3053 			found = B_TRUE;
3054 			break;
3055 		}
3056 
3057 	}
3058 
3059 	if (!found) {
3060 		ibd_print_warn(state, "IPoIB broadcast group absent");
3061 		return (IBT_FAILURE);
3062 	}
3063 
3064 	/*
3065 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3066 	 */
3067 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3068 	if (state->id_mtu < mcgmtu) {
3069 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3070 		    "greater than port's maximum MTU %d", mcgmtu,
3071 		    state->id_mtu);
3072 		return (IBT_FAILURE);
3073 	}
3074 	state->id_mtu = mcgmtu;
3075 
3076 	return (IBT_SUCCESS);
3077 }
3078 
3079 /*
3080  * Post ibt_attach() initialization.
3081  */
3082 static int
3083 ibd_drv_init(ibd_state_t *state)
3084 {
3085 	kthread_t *kht;
3086 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
3087 	ibt_ud_chan_query_attr_t ud_chan_attr;
3088 	ibt_hca_portinfo_t *port_infop;
3089 	ibt_hca_attr_t hca_attrs;
3090 	ibt_status_t ibt_status;
3091 	ibt_cq_attr_t cq_attr;
3092 	ib_guid_t hca_guid;
3093 	uint32_t real_size;
3094 	uint32_t *ptr;
3095 	char pathname[OBP_MAXPATHLEN];
3096 	uint_t psize, port_infosz;
3097 
3098 	/*
3099 	 * Initialize id_port before ibt_open_hca because of
3100 	 * ordering requirements in port up/down handling.
3101 	 */
3102 	if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS)
3103 		return (DDI_FAILURE);
3104 
3105 	if (ibt_open_hca(state->id_ibt_hdl, hca_guid,
3106 	    &state->id_hca_hdl) != IBT_SUCCESS) {
3107 		DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n");
3108 		return (DDI_FAILURE);
3109 	}
3110 
3111 	mutex_enter(&state->id_link_mutex);
3112 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
3113 	    state->id_port, &port_infop, &psize,
3114 	    &port_infosz);
3115 	if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
3116 		mutex_exit(&state->id_link_mutex);
3117 		DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n");
3118 		(void) ibt_close_hca(state->id_hca_hdl);
3119 		return (DDI_FAILURE);
3120 	}
3121 
3122 	/*
3123 	 * If the link already went down by the time we get here, give up;
3124 	 * we can not even get the gid since that is not valid. We would
3125 	 * fail in ibd_find_bgroup() anyway.
3126 	 */
3127 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
3128 		mutex_exit(&state->id_link_mutex);
3129 		ibt_free_portinfo(port_infop, port_infosz);
3130 		(void) ibt_close_hca(state->id_hca_hdl);
3131 		ibd_print_warn(state, "Port is not active");
3132 		return (DDI_FAILURE);
3133 	}
3134 
3135 	/*
3136 	 * This verifies the Pkey ibnexus handed us is still valid.
3137 	 * This is also the point from which the pkey table for the
3138 	 * port must hold the exact pkey value at the exact index
3139 	 * across port up/downs.
3140 	 */
3141 	if (ibt_pkey2index(state->id_hca_hdl, state->id_port,
3142 	    state->id_pkey, &state->id_pkix) != IBT_SUCCESS) {
3143 		mutex_exit(&state->id_link_mutex);
3144 		ibt_free_portinfo(port_infop, port_infosz);
3145 		DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n");
3146 		(void) ibt_close_hca(state->id_hca_hdl);
3147 		return (DDI_FAILURE);
3148 	}
3149 
3150 	state->id_mtu = (128 << port_infop->p_mtu);
3151 	state->id_sgid = *port_infop->p_sgid_tbl;
3152 	state->id_link_state = LINK_STATE_UP;
3153 	mutex_exit(&state->id_link_mutex);
3154 
3155 	ibt_free_portinfo(port_infop, port_infosz);
3156 
3157 	state->id_link_speed = ibd_get_portspeed(state);
3158 
3159 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
3160 	ASSERT(ibt_status == IBT_SUCCESS);
3161 
3162 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
3163 		DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n");
3164 		goto drv_init_fail_find_bgroup;
3165 	}
3166 
3167 	if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
3168 	    &state->id_pd_hdl) != IBT_SUCCESS) {
3169 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n");
3170 		goto drv_init_fail_alloc_pd;
3171 	}
3172 
3173 	/* Initialize the parallel ARP cache and AHs */
3174 	if (ibd_acache_init(state) != DDI_SUCCESS) {
3175 		DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n");
3176 		goto drv_init_fail_acache;
3177 	}
3178 
3179 	/*
3180 	 * Check various tunable limits.
3181 	 */
3182 	if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) {
3183 		ibd_print_warn(state, "Setting #sgl = %d instead of default %d",
3184 		    hca_attrs.hca_max_sgl, IBD_MAX_SQSEG);
3185 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
3186 	} else {
3187 		state->id_max_sqseg = IBD_MAX_SQSEG;
3188 	}
3189 
3190 	/*
3191 	 * First, check #r/s wqes against max channel size.
3192 	 */
3193 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE)
3194 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
3195 	else
3196 		state->id_num_rwqe = IBD_NUM_RWQE;
3197 
3198 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE)
3199 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
3200 	else
3201 		state->id_num_swqe = IBD_NUM_SWQE;
3202 
3203 	/*
3204 	 * Allocate Rx/combined CQ:
3205 	 * Theoretically, there is no point in having more than #rwqe
3206 	 * plus #swqe cqe's, except that the CQ will be signalled for
3207 	 * overflow when the last wqe completes, if none of the previous
3208 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
3209 	 * to make sure such overflow does not occur.
3210 	 */
3211 	cq_attr.cq_sched = NULL;
3212 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
3213 
3214 	if (ibd_separate_cqs == 1) {
3215 		/*
3216 		 * Allocate Receive CQ.
3217 		 */
3218 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
3219 			cq_attr.cq_size = state->id_num_rwqe + 1;
3220 		} else {
3221 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3222 			state->id_num_rwqe = cq_attr.cq_size - 1;
3223 		}
3224 
3225 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3226 			ibd_print_warn(state, "Computed #rwqe %d based on "
3227 			    "requested size and supportable CQ size is less "
3228 			    "than the required threshold %d",
3229 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3230 			goto drv_init_fail_min_rwqes;
3231 		}
3232 
3233 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3234 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3235 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3236 			goto drv_init_fail_alloc_rcq;
3237 		}
3238 		state->id_rxwcs_size = state->id_num_rwqe + 1;
3239 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
3240 		    state->id_rxwcs_size, KM_SLEEP);
3241 
3242 
3243 		/*
3244 		 * Allocate Send CQ.
3245 		 */
3246 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
3247 			cq_attr.cq_size = state->id_num_swqe + 1;
3248 		} else {
3249 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3250 			state->id_num_swqe = cq_attr.cq_size - 1;
3251 		}
3252 
3253 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3254 		    &state->id_scq_hdl, &real_size) != IBT_SUCCESS) {
3255 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3256 			goto drv_init_fail_alloc_scq;
3257 		}
3258 		state->id_txwcs_size = state->id_num_swqe + 1;
3259 		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
3260 		    state->id_txwcs_size, KM_SLEEP);
3261 	} else {
3262 		/*
3263 		 * Allocate combined Send/Receive CQ.
3264 		 */
3265 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
3266 		    state->id_num_swqe + 1)) {
3267 			cq_attr.cq_size = state->id_num_rwqe +
3268 			    state->id_num_swqe + 1;
3269 		} else {
3270 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3271 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
3272 			    state->id_num_rwqe) / (state->id_num_rwqe +
3273 			    state->id_num_swqe);
3274 			state->id_num_swqe = cq_attr.cq_size - 1 -
3275 			    state->id_num_rwqe;
3276 		}
3277 
3278 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3279 			ibd_print_warn(state, "Computed #rwqe %d based on "
3280 			    "requested size and supportable CQ size is less "
3281 			    "than the required threshold %d",
3282 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3283 			goto drv_init_fail_min_rwqes;
3284 		}
3285 
3286 		state->id_rxwcs_size = cq_attr.cq_size;
3287 		state->id_txwcs_size = state->id_rxwcs_size;
3288 
3289 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3290 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3291 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3292 			goto drv_init_fail_alloc_rcq;
3293 		}
3294 		state->id_scq_hdl = state->id_rcq_hdl;
3295 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
3296 		    state->id_rxwcs_size, KM_SLEEP);
3297 		state->id_txwcs = state->id_rxwcs;
3298 	}
3299 
3300 	/*
3301 	 * Print message in case we could not allocate as many wqe's
3302 	 * as was requested. Note that in the combined CQ case, we will
3303 	 * get the following message.
3304 	 */
3305 	if (state->id_num_rwqe != IBD_NUM_RWQE)
3306 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
3307 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
3308 	if (state->id_num_swqe != IBD_NUM_SWQE)
3309 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
3310 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
3311 
3312 	ud_alloc_attr.ud_flags	= IBT_WR_SIGNALED;
3313 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
3314 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
3315 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
3316 	ud_alloc_attr.ud_sizes.cs_sq	= state->id_num_swqe;
3317 	ud_alloc_attr.ud_sizes.cs_rq	= state->id_num_rwqe;
3318 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
3319 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
3320 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
3321 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
3322 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
3323 	ud_alloc_attr.ud_clone_chan	= NULL;
3324 	if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
3325 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) {
3326 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()"
3327 		    "\n");
3328 		goto drv_init_fail_alloc_chan;
3329 	}
3330 
3331 	if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) !=
3332 	    DDI_SUCCESS) {
3333 		DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()");
3334 		goto drv_init_fail_query_chan;
3335 	}
3336 	state->id_qpnum = ud_chan_attr.ud_qpn;
3337 
3338 	/* Initialize the Transmit buffer list */
3339 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
3340 		DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n");
3341 		goto drv_init_fail_txlist_init;
3342 	}
3343 
3344 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
3345 		/* Setup the handler we will use for regular DLPI stuff */
3346 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
3347 		if (ibt_enable_cq_notify(state->id_scq_hdl,
3348 		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
3349 			DPRINT(10, "ibd_drv_init : failed in"
3350 			    " ibt_enable_cq_notify()\n");
3351 			goto drv_init_fail_cq_notify;
3352 		}
3353 	}
3354 
3355 	/* Create the service fifos before we start receiving */
3356 	if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos,
3357 	    state)) == NULL) {
3358 		DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n");
3359 		goto drv_init_fail_srv_fifo;
3360 	}
3361 
3362 	/* Initialize the Receive buffer list */
3363 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
3364 		DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n");
3365 		goto drv_init_fail_rxlist_init;
3366 	}
3367 
3368 	/* Join to IPoIB broadcast group as required by IPoIB */
3369 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
3370 		DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n");
3371 		goto drv_init_fail_join_group;
3372 	}
3373 
3374 	/* Create the async thread */
3375 	if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
3376 	    TS_RUN, minclsyspri)) == NULL) {
3377 		/* Do we have to specially leave the group? */
3378 		DPRINT(10, "ibd_drv_init : failed in thread_create\n");
3379 		goto drv_init_fail_thread_create;
3380 	}
3381 	state->id_async_thrid = kht->t_did;
3382 
3383 	/*
3384 	 * The local mac address is now known. Create the IPoIB
3385 	 * address.
3386 	 */
3387 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
3388 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3389 	/*
3390 	 * Similarly, program in the broadcast mac address.
3391 	 */
3392 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix,
3393 	    state->id_mgid.gid_guid);
3394 
3395 	ptr = (uint32_t *)&state->id_macaddr;
3396 	DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n",
3397 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3398 	ptr = (uint32_t *)&state->id_bcaddr;
3399 	DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n",
3400 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3401 	DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n",
3402 	    state->id_pkey, state->id_mgid.gid_prefix,
3403 	    state->id_mgid.gid_guid);
3404 	DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n",
3405 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3406 	DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey);
3407 	DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu);
3408 	(void) ddi_pathname(state->id_dip, pathname);
3409 	DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname);
3410 
3411 	return (DDI_SUCCESS);
3412 
3413 drv_init_fail_thread_create:
3414 	ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL);
3415 
3416 drv_init_fail_join_group:
3417 	ibd_fini_rxlist(state);
3418 
3419 drv_init_fail_rxlist_init:
3420 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3421 
3422 drv_init_fail_srv_fifo:
3423 drv_init_fail_cq_notify:
3424 	ibd_fini_txlist(state);
3425 
3426 drv_init_fail_txlist_init:
3427 drv_init_fail_query_chan:
3428 	if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS)
3429 		DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()");
3430 
3431 drv_init_fail_alloc_chan:
3432 	if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) !=
3433 	    IBT_SUCCESS))
3434 		DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()");
3435 
3436 	if (ibd_separate_cqs == 1)
3437 		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
3438 		    state->id_txwcs_size);
3439 
3440 drv_init_fail_alloc_scq:
3441 	if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS)
3442 		DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()");
3443 	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
3444 
3445 drv_init_fail_min_rwqes:
3446 drv_init_fail_alloc_rcq:
3447 	ibd_acache_fini(state);
3448 drv_init_fail_acache:
3449 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3450 		DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()");
3451 
3452 drv_init_fail_alloc_pd:
3453 	ibt_free_mcg_info(state->id_mcinfo, 1);
3454 drv_init_fail_find_bgroup:
3455 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
3456 		DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()");
3457 
3458 	return (DDI_FAILURE);
3459 }
3460 
3461 /*
3462  * Allocate the statically allocated Tx buffer list.
3463  */
3464 static int
3465 ibd_init_txlist(ibd_state_t *state)
3466 {
3467 	ibd_swqe_t *swqe;
3468 	int i;
3469 
3470 	for (i = 0; i < state->id_num_swqe; i++) {
3471 		if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) {
3472 			DPRINT(10, "ibd_init_txlist : failed in "
3473 			    "ibd_alloc_swqe()\n");
3474 			ibd_fini_txlist(state);
3475 			return (DDI_FAILURE);
3476 		}
3477 
3478 		/* add to list */
3479 		state->id_tx_list.dl_cnt++;
3480 		if (state->id_tx_list.dl_head == NULL) {
3481 			swqe->swqe_prev = NULL;
3482 			swqe->swqe_next = NULL;
3483 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3484 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3485 		} else {
3486 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3487 			swqe->swqe_next = NULL;
3488 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3489 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3490 		}
3491 	}
3492 
3493 	return (DDI_SUCCESS);
3494 }
3495 
3496 /*
3497  * Free the statically allocated Tx buffer list.
3498  */
3499 static void
3500 ibd_fini_txlist(ibd_state_t *state)
3501 {
3502 	ibd_swqe_t *node;
3503 
3504 	mutex_enter(&state->id_tx_list.dl_mutex);
3505 	while (state->id_tx_list.dl_head != NULL) {
3506 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3507 		state->id_tx_list.dl_head = node->swqe_next;
3508 		state->id_tx_list.dl_cnt--;
3509 		ASSERT(state->id_tx_list.dl_cnt >= 0);
3510 		ibd_free_swqe(state, node);
3511 	}
3512 	mutex_exit(&state->id_tx_list.dl_mutex);
3513 }
3514 
3515 /*
3516  * Allocate a single send wqe and register it so it is almost
3517  * ready to be posted to the hardware.
3518  */
3519 static int
3520 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe)
3521 {
3522 	ibt_mr_attr_t mem_attr;
3523 	ibd_swqe_t *swqe;
3524 
3525 	swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP);
3526 	*wqe = swqe;
3527 	swqe->swqe_type = IBD_WQE_SEND;
3528 	swqe->swqe_next = NULL;
3529 	swqe->swqe_prev = NULL;
3530 	swqe->swqe_im_mblk = NULL;
3531 
3532 	/* alloc copy buffer, must be max size to handle multiple mblk case */
3533 	swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP);
3534 
3535 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3536 	mem_attr.mr_len = state->id_mtu;
3537 	mem_attr.mr_as = NULL;
3538 	mem_attr.mr_flags = IBT_MR_SLEEP;
3539 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3540 	    &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) !=
3541 	    IBT_SUCCESS) {
3542 		DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()");
3543 		kmem_free(swqe->swqe_copybuf.ic_bufaddr,
3544 		    state->id_mtu);
3545 		kmem_free(swqe, sizeof (ibd_swqe_t));
3546 		return (DDI_FAILURE);
3547 	}
3548 
3549 	swqe->swqe_copybuf.ic_sgl.ds_va =
3550 	    (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3551 	swqe->swqe_copybuf.ic_sgl.ds_key =
3552 	    swqe->swqe_copybuf.ic_mr_desc.md_lkey;
3553 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3554 
3555 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3556 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3557 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3558 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3559 
3560 	/* These are set in send */
3561 	swqe->w_swr.wr_nds = 0;
3562 	swqe->w_swr.wr_sgl = NULL;
3563 
3564 	return (DDI_SUCCESS);
3565 }
3566 
3567 /*
3568  * Free an allocated send wqe.
3569  */
3570 static void
3571 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3572 {
3573 
3574 	if (ibt_deregister_mr(state->id_hca_hdl,
3575 	    swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3576 		DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()");
3577 		return;
3578 	}
3579 	kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu);
3580 	kmem_free(swqe, sizeof (ibd_swqe_t));
3581 }
3582 
3583 /*
3584  * Post a rwqe to the hardware and add it to the Rx list. The
3585  * "recycle" parameter indicates whether an old rwqe is being
3586  * recycled, or this is a new one.
3587  */
3588 static int
3589 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3590 {
3591 	/*
3592 	 * Here we should add dl_cnt before post recv, because we would
3593 	 * have to make sure dl_cnt has already updated before
3594 	 * corresponding ibd_process_rx() is called.
3595 	 */
3596 	atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3597 	if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) !=
3598 	    IBT_SUCCESS) {
3599 		(void) atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
3600 		DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()");
3601 		return (DDI_FAILURE);
3602 	}
3603 
3604 	/*
3605 	 * Buffers being recycled are already in the list.
3606 	 */
3607 	if (recycle)
3608 		return (DDI_SUCCESS);
3609 
3610 	mutex_enter(&state->id_rx_list.dl_mutex);
3611 	if (state->id_rx_list.dl_head == NULL) {
3612 		rwqe->rwqe_prev = NULL;
3613 		rwqe->rwqe_next = NULL;
3614 		state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3615 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3616 	} else {
3617 		rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3618 		rwqe->rwqe_next = NULL;
3619 		state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3620 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3621 	}
3622 	mutex_exit(&state->id_rx_list.dl_mutex);
3623 
3624 	return (DDI_SUCCESS);
3625 }
3626 
3627 /*
3628  * Allocate the statically allocated Rx buffer list.
3629  */
3630 static int
3631 ibd_init_rxlist(ibd_state_t *state)
3632 {
3633 	ibd_rwqe_t *rwqe;
3634 	int i;
3635 
3636 	for (i = 0; i < state->id_num_rwqe; i++) {
3637 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3638 			ibd_fini_rxlist(state);
3639 			return (DDI_FAILURE);
3640 		}
3641 
3642 		if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) {
3643 			ibd_free_rwqe(state, rwqe);
3644 			ibd_fini_rxlist(state);
3645 			return (DDI_FAILURE);
3646 		}
3647 	}
3648 
3649 	return (DDI_SUCCESS);
3650 }
3651 
3652 /*
3653  * Free the statically allocated Rx buffer list.
3654  *
3655  */
3656 static void
3657 ibd_fini_rxlist(ibd_state_t *state)
3658 {
3659 	ibd_rwqe_t *node;
3660 
3661 	mutex_enter(&state->id_rx_list.dl_mutex);
3662 	while (state->id_rx_list.dl_head != NULL) {
3663 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3664 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3665 		state->id_rx_list.dl_cnt--;
3666 		ASSERT(state->id_rx_list.dl_cnt >= 0);
3667 
3668 		ibd_free_rwqe(state, node);
3669 	}
3670 	mutex_exit(&state->id_rx_list.dl_mutex);
3671 }
3672 
3673 /*
3674  * Allocate a single recv wqe and register it so it is almost
3675  * ready to be posted to the hardware.
3676  */
3677 static int
3678 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3679 {
3680 	ibt_mr_attr_t mem_attr;
3681 	ibd_rwqe_t *rwqe;
3682 
3683 	if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3684 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3685 		return (DDI_FAILURE);
3686 	}
3687 	*wqe = rwqe;
3688 	rwqe->rwqe_type = IBD_WQE_RECV;
3689 	rwqe->w_state = state;
3690 	rwqe->rwqe_next = NULL;
3691 	rwqe->rwqe_prev = NULL;
3692 	rwqe->w_freeing_wqe = B_FALSE;
3693 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3694 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3695 
3696 	if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3697 	    IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) {
3698 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2");
3699 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3700 		return (DDI_FAILURE);
3701 	}
3702 
3703 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3704 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3705 	    NULL) {
3706 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3707 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3708 		    state->id_mtu + IPOIB_GRH_SIZE);
3709 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3710 		return (DDI_FAILURE);
3711 	}
3712 
3713 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3714 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3715 	mem_attr.mr_as = NULL;
3716 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3717 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3718 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3719 	    IBT_SUCCESS) {
3720 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
3721 		rwqe->w_freeing_wqe = B_TRUE;
3722 		freemsg(rwqe->rwqe_im_mblk);
3723 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3724 		    state->id_mtu + IPOIB_GRH_SIZE);
3725 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3726 		return (DDI_FAILURE);
3727 	}
3728 
3729 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3730 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3731 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3732 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3733 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3734 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3735 	rwqe->w_rwr.wr_nds = 1;
3736 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3737 
3738 	return (DDI_SUCCESS);
3739 }
3740 
3741 /*
3742  * Free an allocated recv wqe.
3743  */
3744 static void
3745 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3746 {
3747 
3748 	if (ibt_deregister_mr(state->id_hca_hdl,
3749 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3750 		DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()");
3751 		return;
3752 	}
3753 
3754 	/*
3755 	 * Indicate to the callback function that this rwqe/mblk
3756 	 * should not be recycled. The freemsg() will invoke
3757 	 * ibd_freemsg_cb().
3758 	 */
3759 	if (rwqe->rwqe_im_mblk != NULL) {
3760 		rwqe->w_freeing_wqe = B_TRUE;
3761 		freemsg(rwqe->rwqe_im_mblk);
3762 	}
3763 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3764 	    state->id_mtu + IPOIB_GRH_SIZE);
3765 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3766 }
3767 
3768 /*
3769  * Delete the rwqe being freed from the rx list.
3770  */
3771 static void
3772 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3773 {
3774 	mutex_enter(&state->id_rx_list.dl_mutex);
3775 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3776 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3777 	else
3778 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3779 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3780 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3781 	else
3782 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3783 	mutex_exit(&state->id_rx_list.dl_mutex);
3784 }
3785 
3786 /*
3787  * Pre ibt_detach() deconstruction.
3788  */
3789 static void
3790 ibd_drv_fini(ibd_state_t *state)
3791 {
3792 	ib_gid_t mgid;
3793 	ibd_mce_t *mce;
3794 	ibt_status_t status;
3795 	uint8_t jstate;
3796 
3797 	/*
3798 	 * Desubscribe from trap notices; we will be tearing down
3799 	 * the mcg lists soon. Make sure the trap handler does nothing
3800 	 * even if it is invoked (ie till we invoke ibt_detach()).
3801 	 */
3802 	ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
3803 	mutex_enter(&state->id_trap_lock);
3804 	state->id_trap_stop = B_TRUE;
3805 	while (state->id_trap_inprog > 0)
3806 		cv_wait(&state->id_trap_cv, &state->id_trap_lock);
3807 	mutex_exit(&state->id_trap_lock);
3808 
3809 	/*
3810 	 * Flushing the channel ensures that all pending WQE's
3811 	 * are marked with flush_error and handed to the CQ. It
3812 	 * does not guarantee the invocation of the CQ handler.
3813 	 * This call is guaranteed to return successfully for UD QPNs.
3814 	 */
3815 	status = ibt_flush_channel(state->id_chnl_hdl);
3816 	ASSERT(status == IBT_SUCCESS);
3817 
3818 	/*
3819 	 * We possibly need a loop here to wait for all the Tx
3820 	 * callbacks to happen. The Tx handlers will retrieve
3821 	 * held resources like AH ac_ref count, registered memory
3822 	 * and possibly ASYNC_REAP requests. Rx interrupts were already
3823 	 * turned off (in ibd_detach()); turn off Tx interrupts and
3824 	 * poll. By the time the polling returns an empty indicator,
3825 	 * we are sure we have seen all pending Tx callbacks. Note
3826 	 * that after the ibt_set_cq_handler() returns, the old handler
3827 	 * is guaranteed not to be invoked anymore.
3828 	 */
3829 	if (ibd_separate_cqs == 1)
3830 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
3831 	ibd_poll_compq(state, state->id_scq_hdl);
3832 
3833 	/*
3834 	 * No more async requests will be posted since the device has been
3835 	 * unregistered; completion handlers have been turned off, so Tx
3836 	 * handler will not cause any more ASYNC_REAP requests. Queue a
3837 	 * request for the async thread to exit, which will be serviced
3838 	 * after any pending ones. This can take a while, specially if the
3839 	 * SM is unreachable, since IBMF will slowly timeout each SM request
3840 	 * issued by the async thread. Reap the thread before continuing on,
3841 	 * we do not want it to be lingering in modunloaded code.
3842 	 */
3843 	ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT);
3844 	thread_join(state->id_async_thrid);
3845 
3846 	/*
3847 	 * We can not be in promiscuous mode anymore, upper layers
3848 	 * would have made a request to disable it (if ever set previously)
3849 	 * before the detach is allowed to progress to this point; and the
3850 	 * aysnc thread would have processed that request by now. Thus the
3851 	 * nonmember list is guaranteed empty at this point.
3852 	 */
3853 	ASSERT(state->id_prom_op != COMPLETED);
3854 
3855 	/*
3856 	 * Drop all residual full/non membership. This includes full
3857 	 * membership to the broadcast group, and any nonmembership
3858 	 * acquired during transmits. We do this after the Tx completion
3859 	 * handlers are done, since those might result in some late
3860 	 * leaves; this also eliminates a potential race with that
3861 	 * path wrt the mc full list insert/delete. Trap handling
3862 	 * has also been suppressed at this point. Thus, no locks
3863 	 * are required while traversing the mc full list.
3864 	 */
3865 	DPRINT(2, "ibd_drv_fini : clear full cache entries");
3866 	mce = list_head(&state->id_mc_full);
3867 	while (mce != NULL) {
3868 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
3869 		jstate = mce->mc_jstate;
3870 		mce = list_next(&state->id_mc_full, mce);
3871 		ibd_leave_group(state, mgid, jstate);
3872 	}
3873 
3874 	ibt_free_mcg_info(state->id_mcinfo, 1);
3875 
3876 	/*
3877 	 * Kill the channel now; guaranteed to return successfully
3878 	 * for UD QPNs.
3879 	 */
3880 	status = ibt_free_channel(state->id_chnl_hdl);
3881 	ASSERT(status == IBT_SUCCESS);
3882 
3883 	/*
3884 	 * Kill the CQ; all completion handlers are guaranteed to
3885 	 * have terminated by the time this returns. Since we killed
3886 	 * the QPN above, we can not receive the IBT_CQ_BUSY error.
3887 	 */
3888 	status = ibt_free_cq(state->id_rcq_hdl);
3889 	ASSERT(status == IBT_SUCCESS);
3890 	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
3891 
3892 	if (ibd_separate_cqs == 1) {
3893 		status = ibt_free_cq(state->id_scq_hdl);
3894 		ASSERT(status == IBT_SUCCESS);
3895 		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
3896 		    state->id_txwcs_size);
3897 	}
3898 
3899 	/*
3900 	 * We killed the receive interrupts, thus, we will not be
3901 	 * required to handle received packets anymore. Thus, kill
3902 	 * service threads since they are not going to be used anymore.
3903 	 */
3904 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3905 
3906 	/*
3907 	 * Since these following will act on the Rx/Tx list, which
3908 	 * is also looked at by the Rx/Tx handlers, keep them around
3909 	 * till all handlers are guaranteed to have completed.
3910 	 */
3911 	ibd_fini_rxlist(state);
3912 	ibd_fini_txlist(state);
3913 
3914 	/*
3915 	 * Clean up the active AH hash list.
3916 	 */
3917 	mod_hash_destroy_hash(state->id_ah_active_hash);
3918 
3919 	/*
3920 	 * Free parallel ARP cache and AHs; we are sure all of these
3921 	 * resources have been released by the Tx completion handler.
3922 	 */
3923 	ibd_acache_fini(state);
3924 
3925 	/*
3926 	 * We freed the QPN, all the MRs and AHs. This step should not
3927 	 * fail; print a warning message if it does fail, due to a bug
3928 	 * in the driver.
3929 	 */
3930 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3931 		ibd_print_warn(state, "failed to free protection domain");
3932 
3933 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
3934 		ibd_print_warn(state, "failed to close HCA device");
3935 }
3936 
3937 /*
3938  * IBA Rx/Tx completion queue handler. Guaranteed to be single
3939  * threaded and nonreentrant for this CQ. When using combined CQ,
3940  * this handles Tx and Rx completions. With separate CQs, this handles
3941  * only Rx completions.
3942  */
3943 /* ARGSUSED */
3944 static void
3945 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3946 {
3947 	ibd_state_t *state = (ibd_state_t *)arg;
3948 
3949 	atomic_add_64(&state->id_num_intrs, 1);
3950 
3951 	if (ibd_rx_softintr == 1)
3952 		ddi_trigger_softintr(state->id_rx);
3953 	else
3954 		(void) ibd_intr((char *)state);
3955 }
3956 
3957 /*
3958  * Separate CQ handler for Tx completions, when the Tx CQ is in
3959  * interrupt driven mode.
3960  */
3961 /* ARGSUSED */
3962 static void
3963 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3964 {
3965 	ibd_state_t *state = (ibd_state_t *)arg;
3966 
3967 	atomic_add_64(&state->id_num_intrs, 1);
3968 
3969 	if (ibd_tx_softintr == 1)
3970 		ddi_trigger_softintr(state->id_tx);
3971 	else
3972 		(void) ibd_tx_recycle((char *)state);
3973 }
3974 
3975 /*
3976  * Multicast group create/delete trap handler. These will be delivered
3977  * on a kernel thread (handling can thus block) and can be invoked
3978  * concurrently. The handler can be invoked anytime after it is
3979  * registered and before ibt_detach().
3980  */
3981 /* ARGSUSED */
3982 static void
3983 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3984     ibt_subnet_event_t *event)
3985 {
3986 	ibd_state_t *state = (ibd_state_t *)arg;
3987 	ibd_req_t *req;
3988 
3989 	/*
3990 	 * The trap handler will get invoked once for every event for
3991 	 * evert port. The input "gid" is the GID0 of the port the
3992 	 * trap came in on; we just need to act on traps that came
3993 	 * to our port, meaning the port on which the ipoib interface
3994 	 * resides. Since ipoib uses GID0 of the port, we just match
3995 	 * the gids to check whether we need to handle the trap.
3996 	 */
3997 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3998 		return;
3999 
4000 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4001 
4002 	switch (code) {
4003 		case IBT_SM_EVENT_UNAVAILABLE:
4004 			/*
4005 			 * If we are in promiscuous mode or have
4006 			 * sendnonmembers, we need to print a warning
4007 			 * message right now. Else, just store the
4008 			 * information, print when we enter promiscuous
4009 			 * mode or attempt nonmember send. We might
4010 			 * also want to stop caching sendnonmember.
4011 			 */
4012 			ibd_print_warn(state, "IBA multicast support "
4013 			    "degraded due to unavailability of multicast "
4014 			    "traps");
4015 			break;
4016 		case IBT_SM_EVENT_AVAILABLE:
4017 			/*
4018 			 * If we printed a warning message above or
4019 			 * while trying to nonmember send or get into
4020 			 * promiscuous mode, print an okay message.
4021 			 */
4022 			ibd_print_warn(state, "IBA multicast support "
4023 			    "restored due to availability of multicast "
4024 			    "traps");
4025 			break;
4026 		case IBT_SM_EVENT_MCG_CREATED:
4027 		case IBT_SM_EVENT_MCG_DELETED:
4028 			/*
4029 			 * Common processing of creation/deletion traps.
4030 			 * First check if the instance is being
4031 			 * [de]initialized; back off then, without doing
4032 			 * anything more, since we are not sure if the
4033 			 * async thread is around, or whether we might
4034 			 * be racing with the detach code in ibd_drv_fini()
4035 			 * that scans the mcg list.
4036 			 */
4037 			if (!ibd_async_safe(state))
4038 				return;
4039 
4040 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4041 			req->rq_gid = event->sm_notice_gid;
4042 			req->rq_ptr = (void *)code;
4043 			ibd_queue_work_slot(state, req, ASYNC_TRAP);
4044 			break;
4045 	}
4046 }
4047 
4048 static void
4049 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4050 {
4051 	ib_gid_t mgid = req->rq_gid;
4052 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4053 
4054 	DPRINT(10, "ibd_async_trap : %d\n", code);
4055 
4056 	/*
4057 	 * Atomically search the nonmember and sendonlymember lists and
4058 	 * delete.
4059 	 */
4060 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4061 
4062 	if (state->id_prom_op == COMPLETED) {
4063 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4064 
4065 		/*
4066 		 * If in promiscuous mode, try to join/attach to the new
4067 		 * mcg. Given the unreliable out-of-order mode of trap
4068 		 * delivery, we can never be sure whether it is a problem
4069 		 * if the join fails. Thus, we warn the admin of a failure
4070 		 * if this was a creation trap. Note that the trap might
4071 		 * actually be reporting a long past event, and the mcg
4072 		 * might already have been deleted, thus we might be warning
4073 		 * in vain.
4074 		 */
4075 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4076 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4077 			ibd_print_warn(state, "IBA promiscuous mode missed "
4078 			    "new multicast gid %016llx:%016llx",
4079 			    (u_longlong_t)mgid.gid_prefix,
4080 			    (u_longlong_t)mgid.gid_guid);
4081 	}
4082 
4083 	/*
4084 	 * Free the request slot allocated by the subnet event thread.
4085 	 */
4086 	ibd_async_done(state);
4087 }
4088 
4089 /*
4090  * GLDv3 entry point to get capabilities.
4091  */
4092 static boolean_t
4093 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4094 {
4095 	_NOTE(ARGUNUSED(arg));
4096 
4097 	switch (cap) {
4098 	case MAC_CAPAB_HCKSUM: {
4099 		uint32_t *txflags = cap_data;
4100 
4101 		if (ibd_csum_send > IBD_CSUM_NONE)
4102 			*txflags = HCKSUM_INET_PARTIAL;
4103 		else
4104 			return (B_FALSE);
4105 		break;
4106 	}
4107 	case MAC_CAPAB_POLL:
4108 		/*
4109 		 * Fallthrough to default, as we don't support GLDv3
4110 		 * polling.  When blanking is implemented, we will need to
4111 		 * change this to return B_TRUE in addition to registering
4112 		 * an mc_resources callback.
4113 		 */
4114 	default:
4115 		return (B_FALSE);
4116 	}
4117 	return (B_TRUE);
4118 }
4119 
/*
 * GLDv3 entry point to start hardware. There is nothing to do here;
 * success (0) is always returned.
 */
static int
ibd_m_start(void *arg)
{
	_NOTE(ARGUNUSED(arg));

	return (0);
}
4129 
/*
 * GLDv3 entry point to stop hardware from receiving packets. This does
 * nothing unless the driver is built with RUN_PERFORMANCE, in which
 * case it calls ibd_perf() on the instance.
 */
/* ARGSUSED */
static void
ibd_m_stop(void *arg)
{
#if defined(RUN_PERFORMANCE)
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_perf(state);
#endif
}
4141 
4142 /*
4143  * GLDv3 entry point to modify device's mac address. We do not
4144  * allow address modifications.
4145  */
4146 static int
4147 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4148 {
4149 	ibd_state_t *state;
4150 
4151 	state = (ibd_state_t *)arg;
4152 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4153 		return (0);
4154 	else
4155 		return (EINVAL);
4156 }
4157 
4158 /*
4159  * The blocking part of the IBA join/leave operations are done out
4160  * of here on the async thread.
4161  */
4162 static void
4163 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4164 {
4165 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4166 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4167 
4168 	if (op == ASYNC_JOIN) {
4169 
4170 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4171 			ibd_print_warn(state, "Joint multicast group failed :"
4172 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4173 		}
4174 	} else {
4175 		/*
4176 		 * Here, we must search for the proper mcg_info and
4177 		 * use that to leave the group.
4178 		 */
4179 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4180 	}
4181 }
4182 
4183 /*
4184  * GLDv3 entry point for multicast enable/disable requests.
4185  * This function queues the operation to the async thread and
4186  * return success for a valid multicast address.
4187  */
4188 static int
4189 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4190 {
4191 	ibd_state_t *state = (ibd_state_t *)arg;
4192 	ipoib_mac_t maddr, *mcast;
4193 	ib_gid_t mgid;
4194 	ibd_req_t *req;
4195 
4196 	/*
4197 	 * The incoming multicast address might not be aligned properly
4198 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4199 	 * it to look like one though, to get the offsets of the mc gid,
4200 	 * since we know we are not going to dereference any values with
4201 	 * the ipoib_mac_t pointer.
4202 	 */
4203 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4204 	mcast = &maddr;
4205 
4206 	/*
4207 	 * Check validity of MCG address. We could additionally check
4208 	 * that a enable/disable is not being issued on the "broadcast"
4209 	 * mcg, but since this operation is only invokable by priviledged
4210 	 * programs anyway, we allow the flexibility to those dlpi apps.
4211 	 * Note that we do not validate the "scope" of the IBA mcg.
4212 	 */
4213 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4214 		return (EINVAL);
4215 
4216 	/*
4217 	 * fill in multicast pkey and scope
4218 	 */
4219 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4220 
4221 	/*
4222 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4223 	 * nothing (ie we stay JOINed to the broadcast group done in
4224 	 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
4225 	 * requires to be joined to broadcast groups at all times.
4226 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4227 	 * depends on this.
4228 	 */
4229 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4230 		return (0);
4231 
4232 	ibd_n2h_gid(mcast, &mgid);
4233 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4234 	if (req == NULL)
4235 		return (ENOMEM);
4236 
4237 	req->rq_gid = mgid;
4238 
4239 	if (add) {
4240 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4241 		    mgid.gid_prefix, mgid.gid_guid);
4242 		ibd_queue_work_slot(state, req, ASYNC_JOIN);
4243 	} else {
4244 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4245 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4246 		ibd_queue_work_slot(state, req, ASYNC_LEAVE);
4247 	}
4248 	return (0);
4249 }
4250 
4251 /*
4252  * The blocking part of the IBA promiscuous operations are done
4253  * out of here on the async thread. The dlpireq parameter indicates
4254  * whether this invocation is due to a dlpi request or due to
4255  * a port up/down event.
4256  */
4257 static void
4258 ibd_async_unsetprom(ibd_state_t *state)
4259 {
4260 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4261 	ib_gid_t mgid;
4262 
4263 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4264 
4265 	while (mce != NULL) {
4266 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4267 		mce = list_next(&state->id_mc_non, mce);
4268 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4269 	}
4270 	state->id_prom_op = NOTSTARTED;
4271 }
4272 
4273 /*
4274  * The blocking part of the IBA promiscuous operations are done
4275  * out of here on the async thread. The dlpireq parameter indicates
4276  * whether this invocation is due to a dlpi request or due to
4277  * a port up/down event.
4278  */
4279 static void
4280 ibd_async_setprom(ibd_state_t *state)
4281 {
4282 	ibt_mcg_attr_t mcg_attr;
4283 	ibt_mcg_info_t *mcg_info;
4284 	ib_gid_t mgid;
4285 	uint_t numg;
4286 	int i, ret = COMPLETED;
4287 
4288 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4289 
4290 	/*
4291 	 * Obtain all active MC groups on the IB fabric with
4292 	 * specified criteria (scope + Pkey + Qkey + mtu).
4293 	 */
4294 	bzero(&mcg_attr, sizeof (mcg_attr));
4295 	mcg_attr.mc_pkey = state->id_pkey;
4296 	mcg_attr.mc_scope = state->id_scope;
4297 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4298 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4299 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4300 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4301 	    IBT_SUCCESS) {
4302 		ibd_print_warn(state, "Could not get list of IBA multicast "
4303 		    "groups");
4304 		ret = ERRORED;
4305 		goto done;
4306 	}
4307 
4308 	/*
4309 	 * Iterate over the returned mcg's and join as NonMember
4310 	 * to the IP mcg's.
4311 	 */
4312 	for (i = 0; i < numg; i++) {
4313 		/*
4314 		 * Do a NonMember JOIN on the MC group.
4315 		 */
4316 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4317 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4318 			ibd_print_warn(state, "IBA promiscuous mode missed "
4319 			    "multicast gid %016llx:%016llx",
4320 			    (u_longlong_t)mgid.gid_prefix,
4321 			    (u_longlong_t)mgid.gid_guid);
4322 	}
4323 
4324 	ibt_free_mcg_info(mcg_info, numg);
4325 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4326 done:
4327 	state->id_prom_op = ret;
4328 }
4329 
4330 /*
4331  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4332  * GLDv3 assumes phys state receives more packets than multi state,
4333  * which is not true for IPoIB. Thus, treat the multi and phys
4334  * promiscuous states the same way to work with GLDv3's assumption.
4335  */
4336 static int
4337 ibd_m_promisc(void *arg, boolean_t on)
4338 {
4339 	ibd_state_t *state = (ibd_state_t *)arg;
4340 	ibd_req_t *req;
4341 
4342 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4343 	if (req == NULL)
4344 		return (ENOMEM);
4345 	if (on) {
4346 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4347 		ibd_queue_work_slot(state, req, ASYNC_PROMON);
4348 	} else {
4349 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4350 		ibd_queue_work_slot(state, req, ASYNC_PROMOFF);
4351 	}
4352 
4353 	return (0);
4354 }
4355 
4356 /*
4357  * GLDv3 entry point for gathering statistics.
4358  */
4359 static int
4360 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
4361 {
4362 	ibd_state_t *state = (ibd_state_t *)arg;
4363 
4364 	switch (stat) {
4365 	case MAC_STAT_IFSPEED:
4366 		*val = state->id_link_speed;
4367 		break;
4368 	case MAC_STAT_MULTIRCV:
4369 		*val = state->id_multi_rcv;
4370 		break;
4371 	case MAC_STAT_BRDCSTRCV:
4372 		*val = state->id_brd_rcv;
4373 		break;
4374 	case MAC_STAT_MULTIXMT:
4375 		*val = state->id_multi_xmt;
4376 		break;
4377 	case MAC_STAT_BRDCSTXMT:
4378 		*val = state->id_brd_xmt;
4379 		break;
4380 	case MAC_STAT_RBYTES:
4381 		*val = state->id_recv_bytes;
4382 		break;
4383 	case MAC_STAT_IPACKETS:
4384 		*val = state->id_rcv_pkt;
4385 		break;
4386 	case MAC_STAT_OBYTES:
4387 		*val = state->id_xmt_bytes;
4388 		break;
4389 	case MAC_STAT_OPACKETS:
4390 		*val = state->id_xmt_pkt;
4391 		break;
4392 	case MAC_STAT_NORCVBUF:
4393 		*val = state->id_rx_short;	/* # times below water mark */
4394 		break;
4395 	case MAC_STAT_OERRORS:
4396 		*val = state->id_ah_error;	/* failed AH translation */
4397 		break;
4398 	case MAC_STAT_IERRORS:
4399 		*val = 0;
4400 		break;
4401 	case MAC_STAT_NOXMTBUF:
4402 		*val = state->id_tx_short;
4403 		break;
4404 	default:
4405 		return (ENOTSUP);
4406 	}
4407 
4408 	return (0);
4409 }
4410 
4411 /*
4412  * Tx reschedule
4413  */
4414 static void
4415 ibd_async_txsched(ibd_state_t *state)
4416 {
4417 	ibd_req_t *req;
4418 
4419 	/*
4420 	 * For poll mode, if ibd is out of Tx wqe, reschedule to collect
4421 	 * the CQEs. Otherwise, just return for out of Tx wqe.
4422 	 */
4423 
4424 	if (ibd_txcomp_poll == 1) {
4425 		mutex_enter(&state->id_txcomp_lock);
4426 		ibd_poll_compq(state, state->id_scq_hdl);
4427 		mutex_exit(&state->id_txcomp_lock);
4428 		if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) {
4429 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4430 			ibd_queue_work_slot(state, req, ASYNC_SCHED);
4431 			return;
4432 		}
4433 	} else if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) {
4434 		return;
4435 	}
4436 
4437 	if (state->id_sched_needed) {
4438 		mac_tx_update(state->id_mh);
4439 		state->id_sched_needed = B_FALSE;
4440 	}
4441 }
4442 
4443 /*
4444  * Release one or more chained send wqes back into free list.
4445  */
4446 static void
4447 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *swqe)
4448 {
4449 	/*
4450 	 * Add back on Tx list for reuse.
4451 	 */
4452 	swqe->swqe_next = NULL;
4453 	mutex_enter(&state->id_tx_list.dl_mutex);
4454 	if (state->id_tx_list.dl_pending_sends) {
4455 		state->id_tx_list.dl_pending_sends = B_FALSE;
4456 	}
4457 	if (state->id_tx_list.dl_head == NULL) {
4458 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
4459 	} else {
4460 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
4461 	}
4462 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
4463 	state->id_tx_list.dl_cnt++;
4464 	mutex_exit(&state->id_tx_list.dl_mutex);
4465 }
4466 
4467 /*
4468  * Acquire send wqe from free list.
4469  * Returns error number and send wqe pointer.
4470  */
4471 static int
4472 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **swqe)
4473 {
4474 	int rc = 0;
4475 	ibd_swqe_t *wqe;
4476 
4477 	/*
4478 	 * Check and reclaim some of the completed Tx requests.
4479 	 * If someone else is already in this code and pulling Tx
4480 	 * completions, no need to poll, since the current lock holder
4481 	 * will do the work anyway. Normally, we poll for completions
4482 	 * every few Tx attempts, but if we are short on Tx descriptors,
4483 	 * we always try to poll.
4484 	 */
4485 	if ((ibd_txcomp_poll == 1) &&
4486 	    (state->id_tx_list.dl_cnt < IBD_TXPOLL_THRESHOLD) &&
4487 	    (mutex_tryenter(&state->id_txcomp_lock) != 0)) {
4488 		DPRINT(10, "ibd_send : polling");
4489 		ibd_poll_compq(state, state->id_scq_hdl);
4490 		mutex_exit(&state->id_txcomp_lock);
4491 	}
4492 
4493 	/*
4494 	 * Grab required transmit wqes.
4495 	 */
4496 	mutex_enter(&state->id_tx_list.dl_mutex);
4497 	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
4498 	if (wqe != NULL) {
4499 		state->id_tx_list.dl_cnt -= 1;
4500 		state->id_tx_list.dl_head = wqe->swqe_next;
4501 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
4502 			state->id_tx_list.dl_tail = NULL;
4503 	} else {
4504 		/*
4505 		 * If we did not find the number we were looking for, flag
4506 		 * no resource. Adjust list appropriately in either case.
4507 		 */
4508 		rc = ENOENT;
4509 		state->id_tx_list.dl_pending_sends = B_TRUE;
4510 		DPRINT(5, "ibd_acquire_swqes: out of Tx wqe");
4511 		atomic_add_64(&state->id_tx_short, 1);
4512 	}
4513 	mutex_exit(&state->id_tx_list.dl_mutex);
4514 	*swqe = wqe;
4515 
4516 	return (rc);
4517 }
4518 
4519 /*
4520  * The passed in packet has this format:
4521  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
4522  */
4523 static boolean_t
4524 ibd_send(ibd_state_t *state, mblk_t *mp)
4525 {
4526 	ibt_status_t ibt_status;
4527 	ibt_mr_attr_t mem_attr;
4528 	ibd_ace_t *ace;
4529 	ibd_swqe_t *node = NULL;
4530 	ipoib_mac_t *dest;
4531 	ibd_req_t *req;
4532 	ib_header_info_t *ipibp;
4533 	ip6_t *ip6h;
4534 	mblk_t *nmp = mp;
4535 	uint_t pktsize;
4536 	size_t	blksize;
4537 	uchar_t *bufp;
4538 	int i, ret, len, nmblks = 1;
4539 	boolean_t dofree = B_TRUE;
4540 
4541 	if ((ret = ibd_acquire_swqes(state, &node)) != 0) {
4542 		state->id_sched_needed = B_TRUE;
4543 		if (ibd_txcomp_poll == 1) {
4544 			goto ibd_send_fail;
4545 		}
4546 		return (B_FALSE);
4547 	}
4548 
4549 	/*
4550 	 * Obtain an address handle for the destination.
4551 	 */
4552 	ipibp = (ib_header_info_t *)mp->b_rptr;
4553 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
4554 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
4555 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
4556 
4557 	pktsize = msgsize(mp);
4558 	atomic_add_64(&state->id_xmt_bytes, pktsize);
4559 	atomic_inc_64(&state->id_xmt_pkt);
4560 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4561 		atomic_inc_64(&state->id_brd_xmt);
4562 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
4563 		atomic_inc_64(&state->id_multi_xmt);
4564 
4565 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
4566 		node->w_ahandle = ace;
4567 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
4568 	} else {
4569 		DPRINT(5,
4570 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
4571 		    ((ret == EFAULT) ? "failed" : "queued"),
4572 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
4573 		    htonl(dest->ipoib_gidpref[1]),
4574 		    htonl(dest->ipoib_gidsuff[0]),
4575 		    htonl(dest->ipoib_gidsuff[1]));
4576 		node->w_ahandle = NULL;
4577 		/*
4578 		 * for the poll mode, it is probably some cqe pending in the
4579 		 * cq. So ibd has to poll cq here, otherwise acache probably
4580 		 * may not be recycled.
4581 		 */
4582 		if (ibd_txcomp_poll == 1) {
4583 			mutex_enter(&state->id_txcomp_lock);
4584 			ibd_poll_compq(state, state->id_scq_hdl);
4585 			mutex_exit(&state->id_txcomp_lock);
4586 		}
4587 		/*
4588 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
4589 		 * can not find a path for the specific dest address. We
4590 		 * should get rid of this kind of packet. With the normal
4591 		 * case, ibd will return the packet to upper layer and wait
4592 		 * for AH creating.
4593 		 */
4594 		if (ret == EFAULT)
4595 			ret = B_TRUE;
4596 		else {
4597 			ret = B_FALSE;
4598 			dofree = B_FALSE;
4599 			state->id_sched_needed = B_TRUE;
4600 		}
4601 		goto ibd_send_fail;
4602 	}
4603 
4604 	/*
4605 	 * For ND6 packets, padding is at the front of the source lladdr.
4606 	 * Insert the padding at front.
4607 	 */
4608 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
4609 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
4610 			if (!pullupmsg(mp, IPV6_HDR_LEN +
4611 			    sizeof (ib_header_info_t))) {
4612 				DPRINT(10, "ibd_send: pullupmsg failure ");
4613 				ret = B_TRUE;
4614 				goto ibd_send_fail;
4615 			}
4616 			ipibp = (ib_header_info_t *)mp->b_rptr;
4617 		}
4618 		ip6h = (ip6_t *)((uchar_t *)ipibp +
4619 		    sizeof (ib_header_info_t));
4620 		len = ntohs(ip6h->ip6_plen);
4621 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
4622 			mblk_t	*pad;
4623 
4624 			pad = allocb(4, 0);
4625 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
4626 			linkb(mp, pad);
4627 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
4628 			    IPV6_HDR_LEN + len + 4) {
4629 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
4630 				    IPV6_HDR_LEN + len + 4)) {
4631 					DPRINT(10, "ibd_send: pullupmsg "
4632 					    "failure ");
4633 					ret = B_TRUE;
4634 					goto ibd_send_fail;
4635 				}
4636 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
4637 				    sizeof (ib_header_info_t));
4638 			}
4639 
4640 			/* LINTED: E_CONSTANT_CONDITION */
4641 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
4642 		}
4643 	}
4644 
4645 	mp->b_rptr += sizeof (ib_addrs_t);
4646 	while (((nmp = nmp->b_cont) != NULL) &&
4647 	    (++nmblks < (state->id_max_sqseg + 1)))
4648 	;
4649 
4650 	pktsize = msgsize(mp);
4651 	/*
4652 	 * GLDv3 will check mtu. We do checksum related work here.
4653 	 */
4654 	IBD_CKSUM_SEND(mp);
4655 
4656 	/*
4657 	 * Copy the data to preregistered buffers, or register the buffer.
4658 	 */
4659 	if ((nmblks <= state->id_max_sqseg) &&
4660 	    (pktsize > IBD_TX_COPY_THRESHOLD)) {
4661 		for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) {
4662 			mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr;
4663 			mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr;
4664 			mem_attr.mr_as = NULL;
4665 			mem_attr.mr_flags = IBT_MR_NOSLEEP;
4666 			ibt_status = ibt_register_mr(state->id_hca_hdl,
4667 			    state->id_pd_hdl, &mem_attr,
4668 			    &node->w_smblkbuf[i].im_mr_hdl,
4669 			    &node->w_smblkbuf[i].im_mr_desc);
4670 			if (ibt_status != IBT_SUCCESS) {
4671 				/*
4672 				 * We do not expect any error other than
4673 				 * IBT_INSUFF_RESOURCE.
4674 				 */
4675 				if (ibt_status != IBT_INSUFF_RESOURCE)
4676 					DPRINT(10, "ibd_send: %d\n",
4677 					    "failed in ibt_register_mem()",
4678 					    ibt_status);
4679 				DPRINT(5, "ibd_send: registration failed");
4680 				node->w_swr.wr_nds = i;
4681 				/*
4682 				 * Deregister already registered memory;
4683 				 * fallback to copying the mblk.
4684 				 */
4685 				ibd_deregister_mr(state, node);
4686 				goto ibd_copy_path;
4687 			}
4688 			node->w_smblk_sgl[i].ds_va =
4689 			    (ib_vaddr_t)(uintptr_t)nmp->b_rptr;
4690 			node->w_smblk_sgl[i].ds_key =
4691 			    node->w_smblkbuf[i].im_mr_desc.md_lkey;
4692 			node->w_smblk_sgl[i].ds_len =
4693 			    nmp->b_wptr - nmp->b_rptr;
4694 		}
4695 		node->swqe_im_mblk = mp;
4696 		node->w_swr.wr_sgl = node->w_smblk_sgl;
4697 		node->w_swr.wr_nds = nmblks;
4698 		dofree = B_FALSE;
4699 	} else {
4700 ibd_copy_path:
4701 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
4702 		node->w_swr.wr_nds = 1;
4703 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
4704 
4705 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
4706 		for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
4707 			blksize = MBLKL(nmp);
4708 			bcopy(nmp->b_rptr, bufp, blksize);
4709 			bufp += blksize;
4710 		}
4711 	}
4712 
4713 	/*
4714 	 * Queue the wqe to hardware.
4715 	 */
4716 	ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL);
4717 	if (ibt_status != IBT_SUCCESS) {
4718 		/*
4719 		 * We should not fail here; but just in case we do, we
4720 		 * print out a warning to log.
4721 		 */
4722 		ibd_print_warn(state, "ibd_send: posting failed: %d",
4723 		    ibt_status);
4724 	}
4725 
4726 	DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X",
4727 	    INCTXPACK, htonl(ace->ac_mac.ipoib_qpn),
4728 	    htonl(ace->ac_mac.ipoib_gidpref[0]),
4729 	    htonl(ace->ac_mac.ipoib_gidpref[1]),
4730 	    htonl(ace->ac_mac.ipoib_gidsuff[0]),
4731 	    htonl(ace->ac_mac.ipoib_gidsuff[1]));
4732 
4733 	if (dofree)
4734 		freemsg(mp);
4735 
4736 	return (B_TRUE);
4737 
4738 ibd_send_fail:
4739 	if (state->id_sched_needed == B_TRUE) {
4740 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4741 		if (req != NULL)
4742 			ibd_queue_work_slot(state, req, ASYNC_SCHED);
4743 		else {
4744 			dofree = B_TRUE;
4745 			ret = B_TRUE;
4746 		}
4747 	}
4748 
4749 	if (dofree)
4750 		freemsg(mp);
4751 
4752 	if (node != NULL)
4753 		ibd_tx_cleanup(state, node);
4754 
4755 	return (ret);
4756 }
4757 
4758 /*
4759  * GLDv3 entry point for transmitting datagram.
4760  */
4761 static mblk_t *
4762 ibd_m_tx(void *arg, mblk_t *mp)
4763 {
4764 	ibd_state_t *state = (ibd_state_t *)arg;
4765 	mblk_t *next;
4766 
4767 	while (mp != NULL) {
4768 		next = mp->b_next;
4769 		mp->b_next = NULL;
4770 		if (!ibd_send(state, mp)) {
4771 			/* Send fail */
4772 			mp->b_next = next;
4773 			break;
4774 		}
4775 		mp = next;
4776 	}
4777 
4778 	return (mp);
4779 }
4780 
4781 /*
4782  * this handles Tx and Rx completions. With separate CQs, this handles
4783  * only Rx completions.
4784  */
4785 static uint_t
4786 ibd_intr(char *arg)
4787 {
4788 	ibd_state_t *state = (ibd_state_t *)arg;
4789 	/*
4790 	 * Poll for completed entries; the CQ will not interrupt any
4791 	 * more for incoming (or transmitted) packets.
4792 	 */
4793 	ibd_poll_compq(state, state->id_rcq_hdl);
4794 
4795 	/*
4796 	 * Now enable CQ notifications; all packets that arrive now
4797 	 * (or complete transmission) will cause new interrupts.
4798 	 */
4799 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
4800 	    IBT_SUCCESS) {
4801 		/*
4802 		 * We do not expect a failure here.
4803 		 */
4804 		DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
4805 	}
4806 
4807 	/*
4808 	 * Repoll to catch all packets that might have arrived after
4809 	 * we finished the first poll loop and before interrupts got
4810 	 * armed.
4811 	 */
4812 	ibd_poll_compq(state, state->id_rcq_hdl);
4813 
4814 	return (DDI_INTR_CLAIMED);
4815 }
4816 
4817 /*
4818  * Common code for interrupt handling as well as for polling
4819  * for all completed wqe's while detaching.
4820  */
4821 static void
4822 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
4823 {
4824 	ibd_wqe_t *wqe;
4825 	ibt_wc_t *wc, *wcs;
4826 	uint_t numwcs, real_numwcs;
4827 	int i;
4828 
4829 	/*
4830 	 * In some cases (eg detaching), this code can be invoked on
4831 	 * any cpu after disabling cq notification (thus no concurrency
4832 	 * exists). Apart from that, the following applies normally:
4833 	 * The receive completion handling is always on the Rx interrupt
4834 	 * cpu. Transmit completion handling could be from any cpu if
4835 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
4836 	 * is interrupt driven. Combined completion handling is always
4837 	 * on the interrupt cpu. Thus, lock accordingly and use the
4838 	 * proper completion array.
4839 	 */
4840 	if (ibd_separate_cqs == 1) {
4841 		if (cq_hdl == state->id_rcq_hdl) {
4842 			wcs = state->id_rxwcs;
4843 			numwcs = state->id_rxwcs_size;
4844 		} else {
4845 			wcs = state->id_txwcs;
4846 			numwcs = state->id_txwcs_size;
4847 		}
4848 	} else {
4849 		wcs = state->id_rxwcs;
4850 		numwcs = state->id_rxwcs_size;
4851 	}
4852 
4853 	if (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
4854 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
4855 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
4856 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
4857 			    (wqe->w_type == IBD_WQE_RECV));
4858 			if (wc->wc_status != IBT_WC_SUCCESS) {
4859 				/*
4860 				 * Channel being torn down.
4861 				 */
4862 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
4863 					DPRINT(5, "ibd_intr: flush error");
4864 					/*
4865 					 * Only invoke the Tx handler to
4866 					 * release possibly held resources
4867 					 * like AH refcount etc. Can not
4868 					 * invoke Rx handler because it might
4869 					 * try adding buffers to the Rx pool
4870 					 * when we are trying to deinitialize.
4871 					 */
4872 					if (wqe->w_type == IBD_WQE_RECV) {
4873 						continue;
4874 					} else {
4875 						DPRINT(10, "%s %d",
4876 						    "ibd_intr: Bad CQ status",
4877 						    wc->wc_status);
4878 					}
4879 				}
4880 			}
4881 			if (wqe->w_type == IBD_WQE_SEND) {
4882 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
4883 			} else {
4884 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
4885 			}
4886 		}
4887 	}
4888 }
4889 
4890 /*
4891  * Deregister the mr associated with a given mblk.
4892  */
4893 static void
4894 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe)
4895 {
4896 	int i;
4897 
4898 	DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe,
4899 	    swqe->w_swr.wr_nds);
4900 
4901 	for (i = 0; i < swqe->w_swr.wr_nds; i++) {
4902 		if (ibt_deregister_mr(state->id_hca_hdl,
4903 		    swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) {
4904 			/*
4905 			 * We do not expect any errors here.
4906 			 */
4907 			DPRINT(10, "failed in ibt_deregister_mem()\n");
4908 		}
4909 	}
4910 }
4911 
4912 /*
4913  * Common code that deals with clean ups after a successful or
4914  * erroneous transmission attempt.
4915  */
4916 static void
4917 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
4918 {
4919 	ibd_ace_t *ace = swqe->w_ahandle;
4920 
4921 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
4922 
4923 	/*
4924 	 * If this was a dynamic registration in ibd_send(),
4925 	 * deregister now.
4926 	 */
4927 	if (swqe->swqe_im_mblk != NULL) {
4928 		ibd_deregister_mr(state, swqe);
4929 		freemsg(swqe->swqe_im_mblk);
4930 		swqe->swqe_im_mblk = NULL;
4931 	}
4932 
4933 	/*
4934 	 * Drop the reference count on the AH; it can be reused
4935 	 * now for a different destination if there are no more
4936 	 * posted sends that will use it. This can be eliminated
4937 	 * if we can always associate each Tx buffer with an AH.
4938 	 * The ace can be null if we are cleaning up from the
4939 	 * ibd_send() error path.
4940 	 */
4941 	if (ace != NULL) {
4942 		/*
4943 		 * The recycling logic can be eliminated from here
4944 		 * and put into the async thread if we create another
4945 		 * list to hold ACE's for unjoined mcg's.
4946 		 */
4947 		if (DEC_REF_DO_CYCLE(ace)) {
4948 			ibd_mce_t *mce;
4949 
4950 			/*
4951 			 * Check with the lock taken: we decremented
4952 			 * reference count without the lock, and some
4953 			 * transmitter might alreay have bumped the
4954 			 * reference count (possible in case of multicast
4955 			 * disable when we leave the AH on the active
4956 			 * list). If not still 0, get out, leaving the
4957 			 * recycle bit intact.
4958 			 *
4959 			 * Atomically transition the AH from active
4960 			 * to free list, and queue a work request to
4961 			 * leave the group and destroy the mce. No
4962 			 * transmitter can be looking at the AH or
4963 			 * the MCE in between, since we have the
4964 			 * ac_mutex lock. In the SendOnly reap case,
4965 			 * it is not neccesary to hold the ac_mutex
4966 			 * and recheck the ref count (since the AH was
4967 			 * taken off the active list), we just do it
4968 			 * to have uniform processing with the Full
4969 			 * reap case.
4970 			 */
4971 			mutex_enter(&state->id_ac_mutex);
4972 			mce = ace->ac_mce;
4973 			if (GET_REF_CYCLE(ace) == 0) {
4974 				CLEAR_REFCYCLE(ace);
4975 				/*
4976 				 * Identify the case of fullmember reap as
4977 				 * opposed to mcg trap reap. Also, port up
4978 				 * might set ac_mce to NULL to indicate Tx
4979 				 * cleanup should do no more than put the
4980 				 * AH in the free list (see ibd_async_link).
4981 				 */
4982 				if (mce != NULL) {
4983 					ace->ac_mce = NULL;
4984 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
4985 					/*
4986 					 * mc_req was initialized at mce
4987 					 * creation time.
4988 					 */
4989 					ibd_queue_work_slot(state,
4990 					    &mce->mc_req, ASYNC_REAP);
4991 				}
4992 				IBD_ACACHE_INSERT_FREE(state, ace);
4993 			}
4994 			mutex_exit(&state->id_ac_mutex);
4995 		}
4996 	}
4997 
4998 	/*
4999 	 * Release the send wqe for reuse.
5000 	 */
5001 	ibd_release_swqes(state, swqe);
5002 }
5003 
5004 /*
5005  * Processing to be done after receipt of a packet; hand off to GLD
5006  * in the format expected by GLD.
5007  * The recvd packet has this format: 2b sap :: 00 :: data.
5008  */
5009 static void
5010 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5011 {
5012 	ib_header_info_t *phdr;
5013 	mblk_t *mp;
5014 	ipoib_hdr_t *ipibp;
5015 	ip6_t *ip6h;
5016 	int rxcnt, len;
5017 
5018 	/*
5019 	 * Track number handed to upper layer, and number still
5020 	 * available to receive packets.
5021 	 */
5022 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5023 	ASSERT(rxcnt >= 0);
5024 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5025 
5026 	/*
5027 	 * Adjust write pointer depending on how much data came in.
5028 	 */
5029 	mp = rwqe->rwqe_im_mblk;
5030 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5031 
5032 	/*
5033 	 * the IB link will deliver one of the IB link layer
5034 	 * headers called, the Global Routing Header (GRH).
5035 	 * ibd driver uses the information in GRH to build the
5036 	 * Header_info structure and pass it with the datagram up
5037 	 * to GLDv3.
5038 	 * If the GRH is not valid, indicate to GLDv3 by setting
5039 	 * the VerTcFlow field to 0.
5040 	 */
5041 	phdr = (ib_header_info_t *)mp->b_rptr;
5042 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
5043 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
5044 
5045 		/* if it is loop back packet, just drop it. */
5046 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
5047 		    IPOIB_ADDRL) == 0) {
5048 			freemsg(mp);
5049 			return;
5050 		}
5051 
5052 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
5053 		    sizeof (ipoib_mac_t));
5054 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
5055 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
5056 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
5057 		} else {
5058 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
5059 		}
5060 	} else {
5061 		/*
5062 		 * It can not be a IBA multicast packet. Must have been
5063 		 * unicast for us. Just copy the interface address to dst.
5064 		 */
5065 		phdr->ib_grh.ipoib_vertcflow = 0;
5066 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
5067 		    sizeof (ipoib_mac_t));
5068 	}
5069 
5070 	DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK);
5071 
5072 	/*
5073 	 * For ND6 packets, padding is at the front of the source/target
5074 	 * lladdr. However the inet6 layer is not aware of it, hence remove
5075 	 * the padding from such packets.
5076 	 */
5077 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
5078 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
5079 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
5080 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5081 			    sizeof (ipoib_hdr_t))) {
5082 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
5083 				freemsg(mp);
5084 				return;
5085 			}
5086 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
5087 			    sizeof (ipoib_pgrh_t));
5088 		}
5089 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5090 		len = ntohs(ip6h->ip6_plen);
5091 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5092 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
5093 			    IPV6_HDR_LEN + len) {
5094 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
5095 				    IPV6_HDR_LEN + len)) {
5096 					DPRINT(10, "ibd_process_rx: pullupmsg"
5097 					    " failed");
5098 					freemsg(mp);
5099 					return;
5100 				}
5101 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5102 				    sizeof (ipoib_pgrh_t) +
5103 				    sizeof (ipoib_hdr_t));
5104 			}
5105 			/* LINTED: E_CONSTANT_CONDITION */
5106 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
5107 		}
5108 	}
5109 
5110 	atomic_add_64(&state->id_recv_bytes, wc->wc_bytes_xfer);
5111 	atomic_inc_64(&state->id_rcv_pkt);
5112 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5113 		atomic_inc_64(&state->id_brd_rcv);
5114 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5115 		atomic_inc_64(&state->id_multi_rcv);
5116 	/*
5117 	 * Hand off to service thread/GLD. When we have hardware that
5118 	 * does hardware checksum, we will pull the checksum from the
5119 	 * work completion structure here.
5120 	 * on interrupt cpu.
5121 	 */
5122 	ibd_send_up(state, mp);
5123 
5124 	/*
5125 	 * Possibly replenish the Rx pool if needed.
5126 	 */
5127 	if (rxcnt < IBD_RX_THRESHOLD) {
5128 		state->id_rx_short++;
5129 		if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) {
5130 			if (ibd_post_rwqe(state, rwqe, B_FALSE) ==
5131 			    DDI_FAILURE) {
5132 				ibd_free_rwqe(state, rwqe);
5133 				return;
5134 			}
5135 		}
5136 	}
5137 }
5138 
5139 /*
5140  * Callback code invoked from STREAMs when the recv data buffer is free
5141  * for recycling.
5142  */
5143 static void
5144 ibd_freemsg_cb(char *arg)
5145 {
5146 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
5147 	ibd_state_t *state = rwqe->w_state;
5148 
5149 	/*
5150 	 * If the wqe is being destructed, do not attempt recycling.
5151 	 */
5152 	if (rwqe->w_freeing_wqe == B_TRUE) {
5153 		DPRINT(6, "ibd_freemsg: wqe being freed");
5154 		return;
5155 	}
5156 
5157 	/*
5158 	 * Upper layer has released held mblk.
5159 	 */
5160 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
5161 
5162 	if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) {
5163 		/*
5164 		 * There are already enough buffers on the Rx ring.
5165 		 * Free this one up.
5166 		 */
5167 		rwqe->rwqe_im_mblk = NULL;
5168 		ibd_delete_rwqe(state, rwqe);
5169 		ibd_free_rwqe(state, rwqe);
5170 		DPRINT(6, "ibd_freemsg: free up wqe");
5171 	} else {
5172 		rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
5173 		    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
5174 		if (rwqe->rwqe_im_mblk == NULL) {
5175 			ibd_delete_rwqe(state, rwqe);
5176 			ibd_free_rwqe(state, rwqe);
5177 			DPRINT(6, "ibd_freemsg: desballoc failed");
5178 			return;
5179 		}
5180 
5181 		/*
5182 		 * Post back to h/w. We could actually have more than
5183 		 * id_num_rwqe WQEs on the list if there were multiple
5184 		 * ibd_freemsg_cb() calls outstanding (since the lock is
5185 		 * not held the entire time). This will start getting
5186 		 * corrected over subsequent ibd_freemsg_cb() calls.
5187 		 */
5188 		if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
5189 			ibd_delete_rwqe(state, rwqe);
5190 			ibd_free_rwqe(state, rwqe);
5191 			return;
5192 		}
5193 	}
5194 }
5195 
5196 static uint_t
5197 ibd_tx_recycle(char *arg)
5198 {
5199 	ibd_state_t *state = (ibd_state_t *)arg;
5200 
5201 	/*
5202 	 * Poll for completed entries; the CQ will not interrupt any
5203 	 * more for completed packets.
5204 	 */
5205 	ibd_poll_compq(state, state->id_scq_hdl);
5206 
5207 	/*
5208 	 * Now enable CQ notifications; all completions originating now
5209 	 * will cause new interrupts.
5210 	 */
5211 	if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) !=
5212 	    IBT_SUCCESS) {
5213 		/*
5214 		 * We do not expect a failure here.
5215 		 */
5216 		DPRINT(10, "ibd_tx_recycle: ibt_enable_cq_notify() failed");
5217 	}
5218 
5219 	/*
5220 	 * Repoll to catch all packets that might have completed after
5221 	 * we finished the first poll loop and before interrupts got
5222 	 * armed.
5223 	 */
5224 	ibd_poll_compq(state, state->id_scq_hdl);
5225 
5226 	/*
5227 	 * Call txsched to notify GLDv3 if it required.
5228 	 */
5229 	ibd_async_txsched(state);
5230 
5231 	return (DDI_INTR_CLAIMED);
5232 }
5233 #ifdef RUN_PERFORMANCE
5234 
5235 /*
5236  * To run the performance test, first do the "ifconfig ibdN plumb" on
5237  * the Rx and Tx side. Then use mdb -kw to tweak the following variables:
5238  * ibd_performance=1.
5239  * ibd_receiver=1 on Rx side.
5240  * ibd_sender=1 on Tx side.
5241  * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update
5242  * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will
5243  * make it drop into a 1 minute loop waiting for packets. An
5244  * ifconfig/unplumb on the Tx will cause it to send packets to Rx.
5245  */
5246 
5247 #define	IBD_NUM_UNSIGNAL	ibd_num_unsignal
5248 #define	IBD_TX_PKTSIZE		ibd_tx_pktsize
5249 #define	IBD_TX_DATASIZE		ibd_tx_datasize
5250 
5251 static ibd_swqe_t **swqes;
5252 static ibt_wc_t *wcs;
5253 
5254 /*
5255  * Set these on Rx and Tx side to do performance run.
5256  */
5257 static int ibd_performance = 0;
5258 static int ibd_receiver = 0;
5259 static int ibd_sender = 0;
5260 static ipoib_mac_t ibd_dest;
5261 
5262 /*
5263  * Interrupt coalescing is achieved by asking for a completion intr
5264  * only every ibd_num_unsignal'th packet.
5265  */
5266 static int ibd_num_unsignal = 8;
5267 
5268 /*
5269  * How big is each packet?
5270  */
5271 static int ibd_tx_pktsize = 2048;
5272 
5273 /*
5274  * Total data size to be transmitted.
5275  */
5276 static int ibd_tx_datasize = 512*1024*1024;
5277 
5278 static volatile boolean_t cq_handler_ran = B_FALSE;
5279 static volatile int num_completions;
5280 
5281 /* ARGSUSED */
5282 static void
5283 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg)
5284 {
5285 	ibd_state_t *state = (ibd_state_t *)arg;
5286 	ibt_cq_hdl_t cqhdl;
5287 	ibd_wqe_t *wqe;
5288 	uint_t polled, i;
5289 	boolean_t cq_enabled = B_FALSE;
5290 
5291 	if (ibd_receiver == 1)
5292 		cqhdl = state->id_rcq_hdl;
5293 	else
5294 		cqhdl = state->id_scq_hdl;
5295 
5296 	/*
5297 	 * Mark the handler as having run and possibly freed up some
5298 	 * slots. Blocked sends can be retried.
5299 	 */
5300 	cq_handler_ran = B_TRUE;
5301 
5302 repoll:
5303 	while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) ==
5304 	    IBT_SUCCESS) {
5305 		num_completions += polled;
5306 		if (ibd_receiver == 1) {
5307 			/*
5308 			 * We can immediately recycle the buffer. No
5309 			 * need to pass up to any IP layer ...
5310 			 */
5311 			for (i = 0; i < polled; i++) {
5312 				wqe = (ibd_wqe_t *)wcs[i].wc_id;
5313 				(void) ibt_post_recv(state->id_chnl_hdl,
5314 				    &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL);
5315 			}
5316 		}
5317 	}
5318 
5319 	/*
5320 	 * If we just repolled, we are done; exit.
5321 	 */
5322 	if (cq_enabled)
5323 		return;
5324 
5325 	/*
5326 	 * Enable CQ.
5327 	 */
5328 	if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
5329 		/*
5330 		 * We do not expect a failure here.
5331 		 */
5332 		cmn_err(CE_CONT, "ibd_perf_handler: notify failed");
5333 	}
5334 	cq_enabled = B_TRUE;
5335 
5336 	/*
5337 	 * Repoll for packets that came in after we finished previous
5338 	 * poll loop but before we turned on notifications.
5339 	 */
5340 	goto repoll;
5341 }
5342 
5343 static void
5344 ibd_perf_tx(ibd_state_t *state)
5345 {
5346 	ibt_mr_hdl_t mrhdl;
5347 	ibt_mr_desc_t mrdesc;
5348 	ibt_mr_attr_t mem_attr;
5349 	ibt_status_t stat;
5350 	ibd_ace_t *ace = NULL;
5351 	ibd_swqe_t *node;
5352 	uchar_t *sendbuf;
5353 	longlong_t stime, etime;
5354 	longlong_t sspin, espin, tspin = 0;
5355 	int i, reps, packets;
5356 
5357 	cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X",
5358 	    htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]),
5359 	    htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]),
5360 	    htonl(ibd_dest.ipoib_gidsuff[1]));
5361 	if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) ||
5362 	    (ibd_dest.ipoib_gidpref[1] == 0)) {
5363 		cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address");
5364 		return;
5365 	}
5366 
5367 	packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE);
5368 	reps = (packets / IBD_NUM_SWQE);
5369 
5370 	cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE);
5371 	cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE);
5372 	cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets);
5373 	cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE);
5374 	cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL);
5375 	if ((packets % IBD_NUM_UNSIGNAL) != 0) {
5376 		/*
5377 		 * This is required to ensure the last packet will trigger
5378 		 * a CQ handler callback, thus we can spin waiting fot all
5379 		 * packets to be received.
5380 		 */
5381 		cmn_err(CE_CONT,
5382 		    "ibd_perf_tx: #Packets not multiple of Signal Grp size");
5383 		return;
5384 	}
5385 	num_completions = 0;
5386 
5387 	swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE,
5388 	    KM_NOSLEEP);
5389 	if (swqes == NULL) {
5390 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5391 		return;
5392 	}
5393 
5394 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5395 	if (wcs == NULL) {
5396 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5397 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5398 		return;
5399 	}
5400 
5401 	/*
5402 	 * Get the ud_dest for the destination.
5403 	 */
5404 	ibd_async_acache(state, &ibd_dest);
5405 	mutex_enter(&state->id_ac_mutex);
5406 	ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0);
5407 	mutex_exit(&state->id_ac_mutex);
5408 	if (ace == NULL) {
5409 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5410 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5411 		cmn_err(CE_CONT, "ibd_perf_tx: no AH");
5412 		return;
5413 	}
5414 
5415 	/*
5416 	 * Set up the send buffer.
5417 	 */
5418 	sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP);
5419 	if (sendbuf == NULL) {
5420 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5421 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5422 		cmn_err(CE_CONT, "ibd_perf_tx: no send buffer");
5423 		return;
5424 	}
5425 
5426 	/*
5427 	 * This buffer can be used in the case when we want to
5428 	 * send data from the same memory area over and over;
5429 	 * it might help in reducing memory traffic.
5430 	 */
5431 	mem_attr.mr_vaddr = (uint64_t)sendbuf;
5432 	mem_attr.mr_len = IBD_TX_PKTSIZE;
5433 	mem_attr.mr_as = NULL;
5434 	mem_attr.mr_flags = IBT_MR_NOSLEEP;
5435 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
5436 	    &mrhdl, &mrdesc) != IBT_SUCCESS) {
5437 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5438 		kmem_free(sendbuf, IBD_TX_PKTSIZE);
5439 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5440 		cmn_err(CE_CONT, "ibd_perf_tx: registration failed");
5441 		return;
5442 	}
5443 
5444 	/*
5445 	 * Allocate private send wqe's.
5446 	 */
5447 	for (i = 0; i < IBD_NUM_SWQE; i++) {
5448 		if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) {
5449 			kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5450 			kmem_free(sendbuf, IBD_TX_PKTSIZE);
5451 			kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5452 			cmn_err(CE_CONT, "ibd_alloc_swqe failure");
5453 			return;
5454 		}
5455 		node->w_ahandle = ace;
5456 #if 0
5457 		node->w_smblkbuf[0].im_mr_hdl = mrhdl;
5458 		node->w_smblkbuf[0].im_mr_desc = mrdesc;
5459 		node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf;
5460 		node->w_smblk_sgl[0].ds_key =
5461 		    node->w_smblkbuf[0].im_mr_desc.md_lkey;
5462 		node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE;
5463 		node->w_swr.wr_sgl = node->w_smblk_sgl;
5464 #else
5465 		node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE;
5466 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5467 #endif
5468 
5469 		/*
5470 		 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
5471 		 * is marked to invoke the CQ handler. That is the only
5472 		 * way we come to know when the send queue can accept more
5473 		 * WRs.
5474 		 */
5475 		if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
5476 			node->w_swr.wr_flags = IBT_WR_NO_FLAGS;
5477 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5478 		node->w_swr.wr_nds = 1;
5479 
5480 		swqes[i] = node;
5481 	}
5482 
5483 	ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state);
5484 
5485 	/*
5486 	 * Post all the requests. We expect this stream of post's will
5487 	 * not overwhelm the hardware due to periodic completions and
5488 	 * pollings that happen out of ibd_perf_handler.
5489 	 * Post a set of requests, till the channel can accept; after
5490 	 * that, wait for the CQ handler to notify us that there is more
5491 	 * space.
5492 	 */
5493 	stime = gethrtime();
5494 	for (; reps > 0; reps--)
5495 		for (i = 0; i < IBD_NUM_SWQE; i++) {
5496 			node = swqes[i];
5497 retry:
5498 			if ((stat = ibt_post_send(state->id_chnl_hdl,
5499 			    &node->w_swr, 1, NULL)) != IBT_SUCCESS) {
5500 				if (stat == IBT_CHAN_FULL) {
5501 					/*
5502 					 * Spin till the CQ handler runs
5503 					 * and then try again.
5504 					 */
5505 					sspin = gethrtime();
5506 					while (!cq_handler_ran)
5507 					;
5508 					espin = gethrtime();
5509 					tspin += (espin - sspin);
5510 					cq_handler_ran = B_FALSE;
5511 					goto retry;
5512 				}
5513 				cmn_err(CE_CONT, "post failure %d/%d", stat, i);
5514 				goto done;
5515 			}
5516 		}
5517 
5518 done:
5519 	/*
5520 	 * We should really be snapshotting when we get the last
5521 	 * completion.
5522 	 */
5523 	while (num_completions != (packets / IBD_NUM_UNSIGNAL))
5524 	;
5525 	etime = gethrtime();
5526 
5527 	cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d",
5528 	    num_completions);
5529 	cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime));
5530 	cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin);
5531 
5532 	/*
5533 	 * Wait a sec for everything to get over.
5534 	 */
5535 	delay(drv_usectohz(2000000));
5536 
5537 	/*
5538 	 * Reset CQ handler to real one; free resources.
5539 	 */
5540 	if (ibd_separate_cqs == 0) {
5541 		ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state);
5542 	} else {
5543 		if (ibd_txcomp_poll == 0)
5544 			ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler,
5545 			    state);
5546 		else
5547 			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5548 	}
5549 
5550 	for (i = 0; i < IBD_NUM_SWQE; i++)
5551 		ibd_free_swqe(state, swqes[i]);
5552 	(void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
5553 	kmem_free(sendbuf, IBD_TX_PKTSIZE);
5554 	kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5555 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5556 }
5557 
5558 static void
5559 ibd_perf_rx(ibd_state_t *state)
5560 {
5561 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5562 	if (wcs == NULL) {
5563 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5564 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5565 		return;
5566 	}
5567 
5568 	/*
5569 	 * We do not need to allocate private recv wqe's. We will
5570 	 * just use the regular ones.
5571 	 */
5572 
5573 	num_completions = 0;
5574 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state);
5575 
5576 	/*
5577 	 * Delay for a minute for all the packets to come in from
5578 	 * transmitter.
5579 	 */
5580 	cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE);
5581 	delay(drv_usectohz(60000000));
5582 	cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions);
5583 
5584 	/*
5585 	 * Reset CQ handler to real one; free resources.
5586 	 */
5587 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5588 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5589 }
5590 
5591 static void
5592 ibd_perf(ibd_state_t *state)
5593 {
5594 	if (ibd_performance == 0)
5595 		return;
5596 
5597 	if (ibd_receiver == 1) {
5598 		ibd_perf_rx(state);
5599 		return;
5600 	}
5601 
5602 	if (ibd_sender == 1) {
5603 		ibd_perf_tx(state);
5604 		return;
5605 	}
5606 }
5607 
5608 #endif /* RUN_PERFORMANCE */
5609