xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * An implementation of the IPoIB standard based on PSARC 2001/289.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/conf.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/modctl.h>
37 #include <sys/stropts.h>
38 #include <sys/stream.h>
39 #include <sys/strsun.h>
40 #include <sys/strsubr.h>
41 #include <sys/dlpi.h>
42 
43 #include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/pattr.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
67 /*
68  * Modes of hardware/driver/software checksum, useful for debugging
69  * and performance studies.
70  *
71  * none: neither h/w (Tavor) nor the driver does checksum; IP software must.
72  * partial: driver does data checksum, IP must provide pseudo header.
73  * perf_partial: driver uses IP provided pseudo cksum as data checksum
74  *		 (thus, real checksumming is not done).
75  */
76 typedef enum {
77 	IBD_CSUM_NONE,
78 	IBD_CSUM_PARTIAL,
79 	IBD_CSUM_PERF_PARTIAL
80 } ibd_csum_type_t;
81 
82 typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;
83 
84 /*
85  * Per interface tunable parameters.
86  */
87 static uint_t ibd_rx_threshold = 16;
88 static uint_t ibd_tx_current_copy_threshold = 0x10000000;
89 static uint_t ibd_num_rwqe = 4095;	/* 1 less than max Tavor CQsize */
90 static uint_t ibd_num_swqe = 4095;	/* 1 less than max Tavor CQsize */
91 static uint_t ibd_num_ah = 16;
92 static uint_t ibd_hash_size = 16;
93 static uint_t ibd_srv_fifos = 0xffff;
94 static uint_t ibd_fifo_depth = 0;
95 static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
96 static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;
97 
98 /*
99  * The driver can use separate CQs for send and receive queues.
100  * While using separate CQs, it is possible to put the send CQ
101  * in polling mode, ie not to enable notifications on that CQ.
102  * If both CQs are interrupt driven, currently it is not possible
103  * for their handlers to be invoked concurrently (since Tavor ties
104  * both interrupts to the same PCI intr line); but the handlers
105  * are not coded with a single interrupt cpu assumption (eg
106  * id_num_intrs is incremented atomically).
107  *
108  * The driver private struct uses id_scq_hdl to track the separate
109  * CQ being used for send; the id_rcq_hdl tracks the receive CQ
110  * if using separate CQs, or it tracks the single CQ when using
111  * combined CQ. The id_wcs completion array is used in the combined
112  * CQ case, and for fetching Rx completions in the separate CQs case;
113  * the id_txwcs is used to fetch Tx completions in the separate CQs
114  * case.
115  */
116 static uint_t ibd_separate_cqs = 1;
117 static uint_t ibd_txcomp_poll = 0;
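/*
 * Hedged usage sketch (illustration only, not part of the original source):
 * since the knobs above are plain driver globals, they can normally be
 * overridden at boot time from /etc/system in the standard Solaris fashion,
 * for example:
 *
 *	set ibd:ibd_srv_fifos = 1
 *	set ibd:ibd_separate_cqs = 1
 *	set ibd:ibd_txcomp_poll = 1
 *
 * The variable names are the ones defined above; whether a given setting is
 * advisable is workload dependent (see the CQ discussion above).
 */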
118 
119 /*
120  * Initial number of IBA resources allocated.
121  */
122 #define	IBD_NUM_RWQE	ibd_num_rwqe
123 #define	IBD_NUM_SWQE	ibd_num_swqe
124 #define	IBD_NUM_AH	ibd_num_ah
125 
126 /* when <= threshold, it's faster to copy to a premapped buffer */
127 #define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold
128 
129 /*
130  * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
131  * allocate a new WQE to put on the rxlist. This value must be <=
132  * IBD_NUM_RWQE/id_num_rwqe.
133  */
134 #define	IBD_RX_THRESHOLD	ibd_rx_threshold
135 
136 /*
137  * Hash table size for the active AH list.
138  */
139 #define	IBD_HASH_SIZE	ibd_hash_size
140 
141 /*
142  * Size of completion array to be filled by a single poll call.
143  */
144 #define	IBD_WC_SIZE	16
145 
146 /*
147  * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This
148  * is based on our above completion array size.
149  */
150 #define	IBD_TXPOLL_MASK	0xf
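/*
 * Illustrative sketch (assumption, not from the original source): with a
 * mask of 0xf the Tx path would poll for completions on every 16th post,
 * along the lines of the fragment below. "nposted" is a hypothetical
 * counter used only for this example; ibd_poll_compq() and id_scq_hdl are
 * real names used elsewhere in this file.
 *
 *	if ((++nposted & IBD_TXPOLL_MASK) == 0)
 *		ibd_poll_compq(state, state->id_scq_hdl);
 */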
151 
152 /*
153  * Number of payload areas the MDT code can support. Choose the same value
154  * that we know is supported by TCP/MDT.
155  */
156 #define	IBD_MDTMAX_SEGS	16
157 
158 /*
159  * PAD routine called during send/recv context
160  */
161 #define	IBD_SEND	0
162 #define	IBD_RECV	1
163 
164 /* Driver State Pointer */
165 void *ibd_list;
166 
167 /* Required system entry points */
168 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
169 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
170 
171 /* Required driver entry points for GLD */
172 static int ibd_reset(gld_mac_info_t *);
173 static int ibd_start(gld_mac_info_t *);
174 static int ibd_stop(gld_mac_info_t *);
175 static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *);
176 static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int);
177 static int ibd_set_promiscuous(gld_mac_info_t *, int);
178 static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *);
179 static int ibd_send(gld_mac_info_t *, mblk_t *);
180 static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **);
181 static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *);
182 static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *);
183 static uint_t ibd_intr(gld_mac_info_t *);
184 
185 /* Private driver entry points for GLD */
186 static int ibd_state_init(ibd_state_t *, dev_info_t *);
187 static void ibd_state_fini(ibd_state_t *);
188 static int ibd_drv_init(ibd_state_t *);
189 static void ibd_drv_fini(ibd_state_t *);
190 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
191 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
192 static void ibd_snet_notices_handler(void *, ib_gid_t,
193     ibt_subnet_event_code_t, ibt_subnet_event_t *);
194 static int ibd_init_txlist(ibd_state_t *);
195 static void ibd_fini_txlist(ibd_state_t *);
196 static int ibd_init_rxlist(ibd_state_t *);
197 static void ibd_fini_rxlist(ibd_state_t *);
198 static void ibd_freemsg_cb(char *);
199 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t);
200 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
201 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **);
202 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
203 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
204 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
205 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
206     ibt_async_event_t *);
207 static int ibd_acache_init(ibd_state_t *);
208 static void ibd_acache_fini(ibd_state_t *);
209 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
210 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
211 static void ibd_async_unsetprom(ibd_state_t *, boolean_t);
212 static void ibd_async_setprom(ibd_state_t *, boolean_t);
213 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
214 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
215 static void ibd_async_txsched(ibd_state_t *);
216 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
217 static void ibd_async_work(ibd_state_t *);
218 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
219 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
220 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
221 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *,
222     ipoib_mac_t *);
223 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
224 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *);
225 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
226 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
227 static uint64_t ibd_get_portspeed(ibd_state_t *);
228 
229 #ifdef RUN_PERFORMANCE
230 static void ibd_perf(ibd_state_t *);
231 #endif
232 
233 /* Streams Module Info */
234 static struct module_info ibd_minfo = {
235 	IBD_IDNUM,		/* module ID Number */
236 	"ibd",			/* module name */
237 	0,			/* min packet size */
238 	INFPSZ,			/* maximum packet size */
239 	IBD_HIWAT,		/* high water mark */
240 	IBD_LOWAT		/* low water mark */
241 };
242 
243 /* Streams Read Queue */
244 static struct qinit ibd_rdinit = {
245 	NULL,			/* put */
246 	gld_rsrv,		/* service */
247 	gld_open,		/* open */
248 	gld_close,		/* close */
249 	NULL,			/* unused */
250 	&ibd_minfo,		/* parameters */
251 	NULL			/* statistics */
252 };
253 
254 /* Streams Write Queue */
255 static struct qinit ibd_wrinit = {
256 	gld_wput,		/* put */
257 	gld_wsrv,		/* service */
258 	NULL,			/* open */
259 	NULL,			/* close */
260 	NULL,			/* unused */
261 	&ibd_minfo,		/* parameters */
262 	NULL			/* statistics */
263 };
264 
265 /* Stream Operations */
266 static struct streamtab ibd_streamtab = {
267 	&ibd_rdinit,		/* read queue */
268 	&ibd_wrinit,		/* write queue */
269 	NULL,			/* lower read queue (MUX) */
270 	NULL			/* lower write queue (MUX) */
271 };
272 
273 /* Character/Block Operations */
274 static struct cb_ops ibd_cb_ops = {
275 	nulldev,		/* open */
276 	nulldev,		/* close */
277 	nodev,			/* strategy (block) */
278 	nodev,			/* print (block) */
279 	nodev,			/* dump (block) */
280 	nodev,			/* read */
281 	nodev,			/* write */
282 	nodev,			/* ioctl */
283 	nodev,			/* devmap */
284 	nodev,			/* mmap */
285 	nodev,			/* segmap */
286 	nochpoll,		/* chpoll */
287 	ddi_prop_op,		/* prop_op */
288 	&ibd_streamtab,		/* streams */
289 	D_MP | D_64BIT,		/* flags */
290 	CB_REV			/* rev */
291 };
292 
293 /* Driver Operations */
294 static struct dev_ops ibd_dev_ops = {
295 	DEVO_REV,		/* struct rev */
296 	0,			/* refcnt */
297 	gld_getinfo,		/* getinfo */
298 	nulldev,		/* identify */
299 	nulldev,		/* probe */
300 	ibd_attach,		/* attach */
301 	ibd_detach,		/* detach */
302 	nodev,			/* reset */
303 	&ibd_cb_ops,		/* cb_ops */
304 	NULL,			/* bus_ops */
305 	nodev			/* power */
306 };
307 
308 /* Module Driver Info */
309 static struct modldrv ibd_modldrv = {
310 	&mod_driverops,
311 	"InfiniBand DLPI Driver %I%",
312 	&ibd_dev_ops
313 };
314 
315 /* Module Linkage */
316 static struct modlinkage ibd_modlinkage = {
317 	MODREV_1,
318 	&ibd_modldrv,
319 	NULL
320 };
321 
322 /*
323  * Module Info passed to IBTL during IBT_ATTACH.
324  *   NOTE:  This data must be static (i.e. IBTL just keeps a pointer to this
325  *	    data).
326  */
327 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
328 	IBTI_V2,
329 	IBT_NETWORK,
330 	ibd_async_handler,
331 	NULL,
332 	"IPIB"
333 };
334 
335 /*
336  * Async operation types.
337  */
338 #define	ASYNC_GETAH	1
339 #define	ASYNC_JOIN	2
340 #define	ASYNC_LEAVE	3
341 #define	ASYNC_PROMON	4
342 #define	ASYNC_PROMOFF	5
343 #define	ASYNC_REAP	6
344 #define	ASYNC_POKE	7
345 #define	ASYNC_TRAP	8
346 #define	ASYNC_SCHED	9
347 #define	ASYNC_LINK	10
348 #define	ASYNC_EXIT	11
349 
350 /*
351  * Async operation states
352  */
353 #define	NOTSTARTED	0
354 #define	ONGOING		1
355 #define	COMPLETED	2
356 #define	ERRORED		3
357 #define	ROUTERED	4
358 
359 #define	IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF
360 
361 #ifdef DEBUG
362 
363 static int rxpack = 1, txpack = 1;
364 int debuglevel = 100;
365 static void
366 debug_print(int l, char *fmt, ...)
367 {
368 	va_list ap;
369 
370 	if (l < debuglevel)
371 		return;
372 	va_start(ap, fmt);
373 	vcmn_err(CE_CONT, fmt, ap);
374 	va_end(ap);
375 }
376 #define	INCRXPACK	(rxpack++)
377 #define	INCTXPACK	(txpack++)
378 #define	DPRINT		debug_print
379 
380 #else /* DEBUG */
381 
382 #define	INCRXPACK	0
383 #define	INCTXPACK	0
384 #define	DPRINT
385 
386 #endif /* DEBUG */
387 
388 /*
389  * Common routine to print warning messages; adds in hca guid, port number
390  * and pkey to be able to identify the IBA interface.
391  */
392 static void
393 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
394 {
395 	ib_guid_t hca_guid;
396 	char ibd_print_buf[256];
397 	int len;
398 	va_list ap;
399 
400 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
401 	    0, "hca-guid", 0);
402 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
403 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname,
404 	    state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid,
405 	    state->id_port, state->id_pkey);
406 	va_start(ap, fmt);
407 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
408 	    fmt, ap);
409 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
410 	va_end(ap);
411 }
412 
413 /* warlock directives */
414 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
415     ibd_state_t::id_ah_active))
416 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free))
417 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
418     ibd_state_t::id_req_list))
419 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
420     ibd_state_t::id_acache_req_cv))
421 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
422     ibd_state_t::id_multi_req))
423 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
424     ibd_state_t::id_multi_addr))
425 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
426     ibd_state_t::id_multi_op))
427 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
428     ibd_state_t::id_multi_queued))
429 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
430     ibd_state_t::id_mc_full))
431 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
432     ibd_state_t::id_mc_non))
433 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
434     ibd_state_t::id_link_state))
435 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
436     ibd_state_s::id_tx_list))
437 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
438     ibd_state_s::id_rx_list))
439 
440 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op))
441 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error))
442 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op))
443 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs))
444 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op))
445 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short))
446 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list))
447 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list))
448 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op))
449 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid))
450 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr))
451 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce))
452 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref))
453 
454 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s))
455 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s))
456 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s))
457 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac))
458 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh))
459 
460 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s))
461 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req))
462 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap))
463 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate))
464 
465 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr))
466 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr))
467 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats))
468 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id))
469 
470 #ifdef DEBUG
471 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack))
472 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack))
473 #endif
474 
475 int
476 _init()
477 {
478 	int status;
479 
480 	/*
481 	 * Sanity check some parameter settings. Tx completion polling
482 	 * only makes sense with separate CQs for Tx and Rx.
483 	 */
484 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
485 		cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname,
486 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
487 		ibd_txcomp_poll = 0;
488 	}
489 
490 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
491 	if (status != 0) {
492 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
493 		return (status);
494 	}
495 
496 	status = mod_install(&ibd_modlinkage);
497 	if (status != 0) {
498 		DPRINT(10, "_init:failed in mod_install()");
499 		ddi_soft_state_fini(&ibd_list);
500 		return (status);
501 	}
502 
503 	return (0);
504 }
505 
506 int
507 _info(struct modinfo *modinfop)
508 {
509 	return (mod_info(&ibd_modlinkage, modinfop));
510 }
511 
512 int
513 _fini()
514 {
515 	int status;
516 
517 	status = mod_remove(&ibd_modlinkage);
518 	if (status != 0)
519 		return (status);
520 
521 	ddi_soft_state_fini(&ibd_list);
522 	return (0);
523 }
524 
525 /*
526  * Convert the GID part of the mac address from network byte order
527  * to host order.
528  */
529 static void
530 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
531 {
532 	ib_sn_prefix_t nbopref;
533 	ib_guid_t nboguid;
534 
535 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
536 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
537 	dgid->gid_prefix = b2h64(nbopref);
538 	dgid->gid_guid = b2h64(nboguid);
539 }
540 
541 /*
542  * Create the IPoIB address in network byte order from host order inputs.
543  */
544 static void
545 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
546     ib_guid_t guid)
547 {
548 	ib_sn_prefix_t nbopref;
549 	ib_guid_t nboguid;
550 
551 	mac->ipoib_qpn = htonl(qpn);
552 	nbopref = h2b64(prefix);
553 	nboguid = h2b64(guid);
554 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
555 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
556 }
557 
558 /*
559  * Send to the appropriate all-routers group when the IBA multicast group
560  * does not exist, based on whether the target group is v4 or v6.
561  */
562 static boolean_t
563 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
564     ipoib_mac_t *rmac)
565 {
566 	boolean_t retval = B_TRUE;
567 	uint32_t adjscope = state->id_scope << 16;
568 	uint32_t topword;
569 
570 	/*
571 	 * Copy the first 4 bytes in without assuming any alignment of
572 	 * input mac address; this will have IPoIB signature, flags and
573 	 * scope bits.
574 	 */
575 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
576 	topword = ntohl(topword);
577 
578 	/*
579 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
580 	 */
581 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
582 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
583 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
584 		    ((uint32_t)(state->id_pkey << 16))),
585 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
586 	else
587 		/*
588 		 * Does not have proper bits in the mgid address.
589 		 */
590 		retval = B_FALSE;
591 
592 	return (retval);
593 }
594 
595 /*
596  * Implementation of various (software) flavors of send and receive side
597  * checksumming.
598  */
599 #define	IBD_CKSUM_SEND(mp) {						\
600 	uint32_t start, stuff, end, value, flags;			\
601 	uint32_t cksum, sum;						\
602 	uchar_t *dp, *buf;						\
603 	uint16_t *up;							\
604 									\
605 	if (ibd_csum_send == IBD_CSUM_NONE)				\
606 		goto punt_send;						\
607 									\
608 	/*								\
609 	 * Query IP whether Tx cksum needs to be done.			\
610 	 */								\
611 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end,		\
612 	    &value, &flags);						\
613 									\
614 	if (flags == HCK_PARTIALCKSUM)	{				\
615 		dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE);		\
616 		up =  (uint16_t *)(dp + stuff);				\
617 		if (ibd_csum_send == IBD_CSUM_PARTIAL) {		\
618 			end = ((uchar_t *)mp->b_wptr - dp - start);	\
619 			cksum = *up;					\
620 			*up = 0;					\
621 			/*						\
622 			 * Does NOT handle chained mblks/more than one	\
623 			 * SGL. Applicable only for a single SGL	\
624 			 * entry/mblk, where the stuff offset is	\
625 			 * within the range of buf.			\
626 			 */						\
627 			buf = (dp + start);				\
628 			sum = IP_BCSUM_PARTIAL(buf, end, cksum);	\
629 		} else {						\
630 			sum = *up;					\
631 		}							\
632 		DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n",	\
633 		    start, stuff, end, sum, cksum);			\
634 		sum = ~(sum);						\
635 		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
636 	}								\
637 punt_send:								\
638 	;								\
639 }
640 
641 #define	IBD_CKSUM_RECV(mp) {						\
642 	uchar_t *dp, *buf;						\
643 	uint32_t start, end, value, stuff, flags;			\
644 	uint16_t *up, frag;						\
645 	ipha_t *iphp;							\
646 	ipoib_hdr_t *ipibh;						\
647 									\
648 	if (ibd_csum_recv == IBD_CSUM_NONE)				\
649 		goto punt_recv;					 	\
650 									\
651 	ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\
652 	if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP)		 	\
653 		goto punt_recv;						\
654 									\
655 	dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE);			\
656 	iphp = (ipha_t *)dp;						\
657 	frag = ntohs(iphp->ipha_fragment_offset_and_flags);		\
658 	if ((frag) & (~IPH_DF))						\
659 		goto punt_recv;						\
660 	start = IPH_HDR_LENGTH(iphp);					\
661 	if (iphp->ipha_protocol == IPPROTO_TCP)				\
662 		stuff = start + 16;					\
663 	else if (iphp->ipha_protocol == IPPROTO_UDP)			\
664 		stuff = start + 6;					\
665 	else								\
666 		goto punt_recv;						\
667 									\
668 	flags = HCK_PARTIALCKSUM;					\
669 	end = ntohs(iphp->ipha_length);					\
670 	up = (uint16_t *)(dp + stuff);					\
671 									\
672 	if (ibd_csum_recv == IBD_CSUM_PARTIAL) {			\
673 		buf = (dp + start);					\
674 		value = IP_BCSUM_PARTIAL(buf, end - start, 0);		\
675 	} else {							\
676 		value = (*up);						\
677 	}								\
678 	if (hcksum_assoc(mp, NULL, NULL, start, stuff, end,		\
679 	    value, flags, 0) != 0)					\
680 		DPRINT(10, "cksum_recv: value: %x\n", value);		\
681 punt_recv:								\
682 	;								\
683 }
684 
685 #define	IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) {		\
686 	/*								\
687 	 * Query IP whether Tx cksum needs to be done.			\
688 	 */								\
689 	if (ibd_csum_send != IBD_CSUM_NONE)				\
690 		hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp);	\
691 }
692 
693 #define	IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) {			\
694 	if ((ibd_csum_send != IBD_CSUM_NONE) &&				\
695 	    (fl == HCK_PARTIALCKSUM)) {					\
696 		extern uint_t bcksum(uchar_t *, int, uint32_t);		\
697 		uint16_t *up;						\
698 		uint32_t sum;						\
699 		uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE;	\
700 		int k;							\
701 									\
702 		up = (uint16_t *)(hp + stf);				\
703 		if (ibd_csum_send == IBD_CSUM_PARTIAL) {		\
704 			sum = *up;					\
705 			*up = 0;					\
706 			sum = IP_BCSUM_PARTIAL(hp + st,			\
707 			    PDESC_HDRL(pinfo) - st - IPOIB_HDRSIZE,	\
708 			    sum);					\
709 			for (k = 0; k < pinfo->pld_cnt; k++)		\
710 				sum = IP_BCSUM_PARTIAL(pinfo->pld_ary[k].\
711 				    pld_rptr, PDESC_PLDL(pinfo, k),	\
712 				    sum);				\
713 		} else {						\
714 			sum = *up;					\
715 		}							\
716 		sum = ~(sum);						\
717 		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
718 	}								\
719 }
720 
721 /*
722  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
723  * front of optional src/tgt link layer address. Right now Solaris inserts
724  * padding by default at the end. The routine which does this is nce_xmit()
725  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
726  * the packet comes down from IP layer to the IBD driver, it is in the
727  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
728  * The OPT_ND_HDR_T is 2 bytes, followed by [22 bytes of ipoib_machdr]. As a
729  * result the machdr is not 4 byte aligned and has 2 bytes of padding at the end.
730  *
731  * The send routine at IBD driver changes this packet as follows:
732  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
733  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
734  * aligned.
735  *
736  * At the receiving side again ibd_process_rx takes the above packet and
737  * removes the two bytes of front padding and inserts it at the end. This
738  * is since the IP layer does not understand padding at the front.
739  */
740 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
741 	uchar_t 	*nd_lla_ptr;					\
742 	icmp6_t 	*icmp6;						\
743 	nd_opt_hdr_t	*opt;						\
744 	int 		i;						\
745 									\
746 	icmp6 = (icmp6_t *)&ip6h[1];					\
747 	len -= sizeof (nd_neighbor_advert_t);				\
748 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
749 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
750 	    (len != 0)) {						\
751 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
752 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
753 		ASSERT(opt != NULL);					\
754 		nd_lla_ptr = (uchar_t *)&opt[1];			\
755 		if (type == 0) {					\
756 			for (i = IPOIB_ADDRL; i > 0; i--)		\
757 				*(nd_lla_ptr + i + 1) =			\
758 				    *(nd_lla_ptr + i - 1);		\
759 		} else {						\
760 			for (i = 0; i < IPOIB_ADDRL; i++)		\
761 				*(nd_lla_ptr + i) =			\
762 				    *(nd_lla_ptr + i + 2);		\
763 		}							\
764 		*(nd_lla_ptr + i) = 0;					\
765 		*(nd_lla_ptr + i + 1) = 0;				\
766 	}								\
767 }
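/*
 * Worked illustration (added for clarity, not in the original source): with
 * IPOIB_ADDRL == 20, the macro above operates on the 22 byte option payload
 * holding the link layer address. On send (type == 0) bytes [0..19] are
 * shifted to [2..21] and bytes [0..1] are zeroed, i.e. the two pad bytes end
 * up in front of the address; on receive (type != 0) the shift is reversed
 * and bytes [20..21] are zeroed, restoring the end padding that IP expects.
 */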
768 
769 /*
770  * The service fifo code is copied verbatim from Cassini. This can be
771  * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu.
772  */
773 
774 typedef caddr_t fifo_obj_t, *p_fifo_obj_t;
775 
776 typedef struct _srv_fifo_t {
777 	kmutex_t fifo_lock;
778 	kcondvar_t fifo_cv;
779 	size_t size;
780 	uint_t max_index;
781 	uint_t rd_index;
782 	uint_t wr_index;
783 	uint_t objs_pending;
784 	p_fifo_obj_t fifo_objs;
785 	kthread_t *fifo_thread;
786 	void (*drain_func)(caddr_t drain_func_arg);
787 	caddr_t drain_func_arg;
788 	boolean_t running;
789 	callb_cpr_t cprinfo;
790 } srv_fifo_t, *p_srv_fifo_t;
791 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv))
792 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo))
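/*
 * Hedged usage sketch (illustration only): the fifo routines below form a
 * simple producer/consumer ring. A typical lifecycle, using only names
 * defined in this file, would look roughly like:
 *
 *	p_srv_fifo_t fifo;
 *
 *	(void) _ddi_srv_fifo_create(&fifo, depth, drain_fifo, (caddr_t)state);
 *	...
 *	(void) _ddi_put_fifo(fifo, (fifo_obj_t)mp, B_TRUE);
 *	...
 *	_ddi_srv_fifo_destroy(fifo);
 *
 * The drain thread consumes objects via _ddi_get_fifo(); "depth" and the
 * error handling are omitted/simplified here for brevity.
 */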
793 
794 static int
795 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size,
796 			void (*drain_func)(), caddr_t drain_func_arg)
797 {
798 	int status;
799 	p_srv_fifo_t srv_fifo;
800 
801 	status = DDI_SUCCESS;
802 	srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP);
803 	srv_fifo->size = size;
804 	srv_fifo->max_index = size - 1;
805 	srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc(
806 	    size * sizeof (fifo_obj_t), KM_SLEEP);
807 	mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL);
808 	cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL);
809 	srv_fifo->drain_func = drain_func;
810 	srv_fifo->drain_func_arg = drain_func_arg;
811 	srv_fifo->running = DDI_SUCCESS;
812 	srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func,
813 	    (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60);
814 	if (srv_fifo->fifo_thread == NULL) {
815 		cv_destroy(&srv_fifo->fifo_cv);
816 		mutex_destroy(&srv_fifo->fifo_lock);
817 		kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t));
818 		kmem_free(srv_fifo, sizeof (srv_fifo_t));
819 		srv_fifo = NULL;
820 		status = DDI_FAILURE;
821 	} else
822 		*handle = srv_fifo;
823 	return (status);
824 }
825 
826 static void
827 _ddi_srv_fifo_destroy(p_srv_fifo_t handle)
828 {
829 	kt_did_t tid = handle->fifo_thread->t_did;
830 
831 	mutex_enter(&handle->fifo_lock);
832 	handle->running = DDI_FAILURE;
833 	cv_signal(&handle->fifo_cv);
834 	while (handle->running == DDI_FAILURE)
835 		cv_wait(&handle->fifo_cv, &handle->fifo_lock);
836 	mutex_exit(&handle->fifo_lock);
837 	if (handle->objs_pending != 0)
838 		cmn_err(CE_NOTE, "!Thread Exit with work undone.");
839 	cv_destroy(&handle->fifo_cv);
840 	mutex_destroy(&handle->fifo_lock);
841 	kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t));
842 	kmem_free(handle, sizeof (srv_fifo_t));
843 	thread_join(tid);
844 }
845 
846 static caddr_t
847 _ddi_srv_fifo_begin(p_srv_fifo_t handle)
848 {
849 #ifndef __lock_lint
850 	CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock,
851 	    callb_generic_cpr, "srv_fifo");
852 #endif /* ! _lock_lint */
853 	return (handle->drain_func_arg);
854 }
855 
856 static void
857 _ddi_srv_fifo_end(p_srv_fifo_t handle)
858 {
859 	callb_cpr_t cprinfo;
860 
861 	mutex_enter(&handle->fifo_lock);
862 	cprinfo = handle->cprinfo;
863 	handle->running = DDI_SUCCESS;
864 	cv_signal(&handle->fifo_cv);
865 #ifndef __lock_lint
866 	CALLB_CPR_EXIT(&cprinfo);
867 #endif /* ! _lock_lint */
868 	thread_exit();
869 	_NOTE(NOT_REACHED)
870 }
871 
872 static int
873 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal)
874 {
875 	int status;
876 
877 	mutex_enter(&handle->fifo_lock);
878 	status = handle->running;
879 	if (status == DDI_SUCCESS) {
880 		if (ptr) {
881 			if (handle->objs_pending < handle->size) {
882 				if (handle->wr_index == handle->max_index)
883 					handle->wr_index = 0;
884 				else
885 					handle->wr_index++;
886 				handle->fifo_objs[handle->wr_index] = ptr;
887 				handle->objs_pending++;
888 			} else
889 				status = DDI_FAILURE;
890 			if (signal)
891 				cv_signal(&handle->fifo_cv);
892 		} else {
893 			if (signal && (handle->objs_pending > 0))
894 				cv_signal(&handle->fifo_cv);
895 		}
896 	}
897 	mutex_exit(&handle->fifo_lock);
898 	return (status);
899 }
900 
901 static int
902 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr)
903 {
904 	int status;
905 
906 	mutex_enter(&handle->fifo_lock);
907 	status = handle->running;
908 	if (status == DDI_SUCCESS) {
909 		if (handle->objs_pending == 0) {
910 #ifndef __lock_lint
911 			CALLB_CPR_SAFE_BEGIN(&handle->cprinfo);
912 			cv_wait(&handle->fifo_cv, &handle->fifo_lock);
913 			CALLB_CPR_SAFE_END(&handle->cprinfo,
914 			    &handle->fifo_lock);
915 #endif /* !_lock_lint */
916 			*ptr = NULL;
917 		}
918 		if (handle->objs_pending > 0) {
919 			if (handle->rd_index == handle->max_index)
920 				handle->rd_index = 0;
921 			else
922 				handle->rd_index++;
923 			*ptr = handle->fifo_objs[handle->rd_index];
924 			handle->objs_pending--;
925 		}
926 		status = handle->running;
927 	} else {
928 		if (handle->objs_pending) {
929 			if (handle->rd_index == handle->max_index)
930 				handle->rd_index = 0;
931 			else
932 				handle->rd_index++;
933 			*ptr = handle->fifo_objs[handle->rd_index];
934 			handle->objs_pending--;
935 			status = DDI_SUCCESS;
936 		} else
937 			status = DDI_FAILURE;
938 	}
939 	mutex_exit(&handle->fifo_lock);
940 	return (status);
941 }
942 
943 /*
944  * [un]map_rx_srv_fifos has been modified from its CE version.
945  */
946 static void
947 drain_fifo(p_srv_fifo_t handle)
948 {
949 	ibd_state_t *state;
950 	mblk_t *mp;
951 
952 	state = (ibd_state_t *)_ddi_srv_fifo_begin(handle);
953 	while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) {
954 		/*
955 		 * Hand off to GLD.
956 		 */
957 		IBD_CKSUM_RECV(mp);
958 		gld_recv(state->id_macinfo, mp);
959 	}
960 	_ddi_srv_fifo_end(handle);
961 }
962 
963 static p_srv_fifo_t *
964 map_rx_srv_fifos(int *nfifos, void *private)
965 {
966 	p_srv_fifo_t *srv_fifos;
967 	int i, inst_taskqs, depth;
968 
969 	/*
970 	 * Default behavior on both sparc and amd cpus in terms of
971 	 * the worker thread is as follows: (N) indicates worker thread
972 	 * not enabled, (Y) indicates worker thread enabled. The default of
973 	 * ibd_srv_fifos is 0xffff. The default behavior can be
974 	 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below.
975 	 * The worker thread model assigns lower priority to network
976 	 * processing, making the system more usable at higher network
977 	 * loads.
978 	 *  ________________________________________________________
979 	 * |Value of ibd_srv_fifos| 0 | 1 | 0xffff| 0 | 1 | 0xffff  |
980 	 * |----------------------|---|---|-------|---|---|---------|
981 	 * |			  |   Sparc	  |   	x86	    |
982 	 * |----------------------|---|---|-------|---|---|---------|
983 	 * | Single CPU		  |N  | Y | N	  | N | Y | N	    |
984 	 * |----------------------|---|---|-------|---|---|---------|
985 	 * | Multi CPU		  |N  | Y | Y	  | N | Y | Y	    |
986 	 * |______________________|___|___|_______|___|___|_________|
987 	 */
988 	if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) ||
989 	    (ibd_srv_fifos == 0)) {
990 		*nfifos = 0;
991 		return ((p_srv_fifo_t *)1);
992 	}
993 
994 	*nfifos = inst_taskqs;
995 	srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t),
996 	    KM_SLEEP);
997 
998 	/*
999 	 * If the administrator has specified a fifo depth, use
1000 	 * that, else just decide what should be the depth.
1001 	 */
1002 	if (ibd_fifo_depth == 0)
1003 		depth = (IBD_NUM_RWQE / inst_taskqs) + 16;
1004 	else
1005 		depth = ibd_fifo_depth;
1006 
1007 	for (i = 0; i < inst_taskqs; i++)
1008 		if (_ddi_srv_fifo_create(&srv_fifos[i],
1009 		    depth, drain_fifo,
1010 		    (caddr_t)private) != DDI_SUCCESS)
1011 			break;
1012 
1013 	if (i < inst_taskqs)
1014 		goto map_rx_srv_fifos_fail1;
1015 
1016 	goto map_rx_srv_fifos_exit;
1017 
1018 map_rx_srv_fifos_fail1:
1019 	i--;
1020 	for (; i >= 0; i--) {
1021 		_ddi_srv_fifo_destroy(srv_fifos[i]);
1022 	}
1023 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
1024 	srv_fifos = NULL;
1025 
1026 map_rx_srv_fifos_exit:
1027 	return (srv_fifos);
1028 }
1029 
1030 static void
1031 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos)
1032 {
1033 	int i;
1034 
1035 	/*
1036 	 * If this interface was not using service fifos, quickly return.
1037 	 */
1038 	if (inst_taskqs == 0)
1039 		return;
1040 
1041 	for (i = 0; i < inst_taskqs; i++) {
1042 		_ddi_srv_fifo_destroy(srv_fifos[i]);
1043 	}
1044 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
1045 }
1046 
1047 /*
1048  * Choose between sending up the packet directly and handing off
1049  * to a service thread.
1050  */
1051 static void
1052 ibd_send_up(ibd_state_t *state, mblk_t *mp)
1053 {
1054 	p_srv_fifo_t *srvfifo;
1055 	ipoib_hdr_t *lhdr;
1056 	struct ip *ip_hdr;
1057 	struct udphdr *tran_hdr;
1058 	uchar_t prot;
1059 	int tnum = -1, nfifos = state->id_nfifos;
1060 
1061 	/*
1062 	 * Quick path if the interface is not using service fifos.
1063 	 */
1064 	if (nfifos == 0) {
1065 hand_off:
1066 		IBD_CKSUM_RECV(mp);
1067 		gld_recv(state->id_macinfo, mp);
1068 		return;
1069 	}
1070 
1071 	/*
1072 	 * Is the packet big enough to look at the IPoIB header
1073 	 * and basic IP header to determine whether it is an
1074 	 * IPv4 packet?
1075 	 */
1076 	if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE +
1077 	    sizeof (struct ip))) {
1078 
1079 		lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE);
1080 
1081 		/*
1082 		 * Is the packet an IP(v4) packet?
1083 		 */
1084 		if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) {
1085 
1086 			ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE +
1087 			    IPOIB_HDRSIZE);
1088 			prot = ip_hdr->ip_p;
1089 
1090 			/*
1091 			 * TCP or UDP packet? We use the UDP header, since
1092 			 * the first few words of both headers are laid out
1093 			 * similarly (src/dest ports).
1094 			 */
1095 			if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) {
1096 
1097 				tran_hdr = (struct udphdr *)(
1098 				    (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2));
1099 
1100 				/*
1101 				 * Are we within limits of this packet? If
1102 				 * so, use the destination port to hash to
1103 				 * a service thread.
1104 				 */
1105 				if (mp->b_wptr >= ((uchar_t *)tran_hdr +
1106 				    sizeof (*tran_hdr)))
1107 					tnum = (ntohs(tran_hdr->uh_dport) +
1108 					    ntohs(tran_hdr->uh_sport)) %
1109 					    nfifos;
1110 			}
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the
1116 	 * packet up in interrupt context, reducing latency.
1117 	 */
1118 	if (tnum == -1) {
1119 		goto hand_off;
1120 	}
1121 
1122 	srvfifo = (p_srv_fifo_t *)state->id_fifos;
1123 	if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp,
1124 	    B_TRUE) != DDI_SUCCESS)
1125 		freemsg(mp);
1126 }
1127 
1128 /*
1129  * Address handle entries maintained by the driver are kept in the
1130  * free and active lists. Each entry starts out in the free list;
1131  * it migrates to the active list when primed using ibt_get_paths()
1132  * and ibt_modify_ud_dest() for transmission to a specific destination.
1133  * In the active list, the entry has a reference count indicating the
1134  * number of ongoing/uncompleted transmits that reference it. The
1135  * entry is left in the active list even after the reference count
1136  * goes to 0, since successive transmits can find it there and do
1137  * not need to set up another entry (ie the path information is
1138  * cached using the active list). Entries on the active list are
1139  * also hashed using the destination link address as a key for faster
1140  * lookups during transmits.
1141  *
1142  * For any destination address (unicast or multicast, whatever the
1143  * join states), there will be at most one entry in the active list.
1144  * Entries with a 0 reference count on the active list can be reused
1145  * for a transmit to a new destination, if the free list is empty.
1146  *
1147  * The AH free list insertion/deletion is protected with the id_ac_mutex,
1148  * since the async thread and Tx callback handlers insert/delete. The
1149  * active list does not need a lock (all operations are done by the
1150  * async thread) but updates to the reference count are atomically
1151  * done (increments done by Tx path, decrements by the Tx callback handler).
1152  */
1153 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
1154 	list_insert_head(&state->id_ah_free, ce)
1155 #define	IBD_ACACHE_GET_FREE(state) \
1156 	list_get_head(&state->id_ah_free)
1157 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
1158 	int _ret_;						\
1159 	list_insert_head(&state->id_ah_active, ce);		\
1160 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
1161 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
1162 	ASSERT(_ret_ == 0);					\
1163 }
1164 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
1165 	list_remove(&state->id_ah_active, ce);			\
1166 	(void) mod_hash_remove(state->id_ah_active_hash,	\
1167 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
1168 }
1169 #define	IBD_ACACHE_GET_ACTIVE(state) \
1170 	list_get_head(&state->id_ah_active)
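/*
 * Illustrative sketch (assumption, not part of the original source): a
 * simplified view of how the Tx path is expected to use the cache
 * primitives above together with ibd_acache_lookup(), defined later in
 * this file. Error handling is elided.
 *
 *	ibd_ace_t *ace;
 *
 *	if ((ace = ibd_acache_lookup(state, mac, &err, 1)) == NULL)
 *		return;		path resolution was queued to the async
 *				thread; GLD is asked to retry later
 *	... post the send using ace->ac_dest ...
 *
 * The reference taken by the lookup is dropped by the Tx completion
 * handler (see the DEC_REF_DO_CYCLE discussion further below).
 */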
1171 
1172 /*
1173  * Membership states for different mcg's are tracked by two lists:
1174  * the "non" list is used for promiscuous mode, when all mcg traffic
1175  * needs to be inspected. This type of membership is never used for
1176  * transmission, so there can not be an AH in the active list
1177  * corresponding to a member in this list. This list does not need
1178  * any protection, since all operations are performed by the async
1179  * thread.
1180  *
1181  * "Full" and "SendOnly" membership is tracked using a single list,
1182  * the "full" list. This is because this single list can then be
1183  * searched during transmit to a multicast group (if an AH for the
1184  * mcg is not found in the active list), since at least one type
1185  * of membership must be present before initiating the transmit.
1186  * This list is also emptied during driver detach, since sendonly
1187  * membership acquired during transmit is dropped at detach time
1188  * along with ipv4 broadcast full membership. Inserts/deletes to
1189  * this list are done only by the async thread, but it is also
1190  * searched in program context (see multicast disable case), thus
1191  * the id_mc_mutex protects the list. The driver detach path also
1192  * deconstructs the "full" list, but it ensures that the async
1193  * thread will not be accessing the list (by blocking out mcg
1194  * trap handling and making sure no more Tx reaping will happen).
1195  *
1196  * Currently, an IBA attach is done in the SendOnly case too,
1197  * although this is not required.
1198  */
1199 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1200 	list_insert_head(&state->id_mc_full, mce)
1201 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1202 	list_insert_head(&state->id_mc_non, mce)
1203 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1204 	ibd_mcache_find(mgid, &state->id_mc_full)
1205 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1206 	ibd_mcache_find(mgid, &state->id_mc_non)
1207 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1208 	list_remove(&state->id_mc_full, mce)
1209 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1210 	list_remove(&state->id_mc_non, mce)
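/*
 * Hedged sketch (assumption): before transmitting to a multicast group
 * whose AH is not already cached, the driver is expected to consult the
 * "full" list and fall back to a SendOnly join, roughly:
 *
 *	if (IBD_MCACHE_FIND_FULL(state, mgid) == NULL)
 *		(void) ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
 *
 * ibd_join_group() is declared earlier in this file and the join state
 * constant comes from the IBTF headers; the exact sequence used by the
 * real Tx path may differ.
 */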
1211 
1212 /*
1213  * AH and MCE active list manipulation:
1214  *
1215  * Multicast disable requests and MCG delete traps are two cases
1216  * where the active AH entry for the mcg (if any unreferenced one exists)
1217  * will be moved to the free list (to force the next Tx to the mcg to
1218  * join the MCG in SendOnly mode). Port up handling will also move AHs
1219  * from active to free list.
1220  *
1221  * In the case when some transmits are still pending on an entry
1222  * for an mcg, but a multicast disable has already been issued on the
1223  * mcg, there are some options to consider to preserve the join state
1224  * to ensure the emitted packet is properly routed on the IBA fabric.
1225  * For the AH, we can
1226  * 1. take out of active list at multicast disable time.
1227  * 2. take out of active list only when last pending Tx completes.
1228  * For the MCE, we can
1229  * 3. take out of active list at multicast disable time.
1230  * 4. take out of active list only when last pending Tx completes.
1231  * 5. move from active list to stale list at multicast disable time.
1232  * We choose to use 2,4. We use option 4 so that if a multicast enable
1233  * is tried before the pending Tx completes, the enable code finds the
1234  * mce in the active list and just has to make sure it will not be reaped
1235  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1236  * a stale list (#5) that would be checked in the enable code would need
1237  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1238  * after the multicast disable would try to put an AH in the active list,
1239  * and associate the mce it finds in the active list to this new AH,
1240  * whereas the mce is already associated with the previous AH (taken off
1241  * the active list), and will be removed once the pending Tx's complete
1242  * (unless a reference count on mce's is implemented). One implication of
1243  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1244  * grab new references on the AH, further delaying the leave.
1245  *
1246  * In the case of mcg delete (or create) trap when the port is sendonly
1247  * joined, the AH and MCE handling is different: the AH and MCE has to be
1248  * immediately taken off the active lists (forcing a join and path lookup
1249  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1250  * to an mcg as it is repeatedly created and deleted and goes thru
1251  * reincarnations).
1252  *
1253  * When a port is already sendonly joined, and a multicast enable is
1254  * attempted, the same mce structure is promoted; this ensures only a
1255  * single mce on the active list tracks the most powerful join state.
1256  *
1257  * In the case of port up event handling, the MCE for sendonly membership
1258  * is freed up, and the ACE is put into the free list as soon as possible
1259  * (depending on whether posted Tx's have completed). For fullmembership
1260  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1261  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1262  * done; else the mce is deconstructed (mc_fullreap case).
1263  *
1264  * MCG creation and deletion trap handling:
1265  *
1266  * These traps are unreliable (meaning sometimes the trap might never
1267  * be delivered to the subscribed nodes) and may arrive out-of-order
1268  * since they use UD transport. An alternative to relying on these
1269  * unreliable traps is to poll for mcg presence every so often, but
1270  * instead of doing that, we try to be as conservative as possible
1271  * while handling the traps, and hope that the traps do arrive at
1272  * the subscribed nodes soon. Note that if a node is fullmember
1273  * joined to an mcg, it can not possibly receive an mcg create/delete
1274  * trap for that mcg (by fullmember definition); if it does, it is
1275  * an old trap from a previous incarnation of the mcg.
1276  *
1277  * Whenever a trap is received, the driver cleans up its sendonly
1278  * membership to the group; we choose to do a sendonly leave even
1279  * on a creation trap to handle the case of a prior deletion of the mcg
1280  * having gone unnoticed. Consider an example scenario:
1281  * T1: MCG M is deleted, and fires off deletion trap D1.
1282  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1283  * T3: Node N tries to transmit to M, joining in sendonly mode.
1284  * T4: MCG M is deleted, and fires off deletion trap D2.
1285  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1286  *     If the trap is D2, then a LEAVE is not required, since the mcg
1287  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1288  *     approach is to always LEAVE, but the SM may be confused if it
1289  *     receives a LEAVE without a prior JOIN.
1290  *
1291  * Management of the non-membership to an mcg is similar to the above,
1292  * except that if the interface is in promiscuous mode, it is required
1293  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1294  * if the re-join attempt fails (in which case a warning message needs
1295  * to be printed), it is not clear whether it failed due to the mcg not
1296  * existing, or some fabric/hca issues, due to the delayed nature of
1297  * trap delivery. Querying the SA to establish presence/absence of the
1298  * mcg is also racy at best. Thus, the driver just prints a warning
1299  * message when it can not rejoin after receiving a create trap, although
1300  * this might be (on rare occasions) a mis-warning if the create trap is
1301  * received after the mcg was deleted.
1302  */
1303 
1304 /*
1305  * Implementation of atomic "recycle" bits and reference count
1306  * on address handles. This utilizes the fact that max reference
1307  * count on any handle is limited by number of send wqes, thus
1308  * high bits in the ac_ref field can be used as the recycle bits,
1309  * and only the low bits hold the number of pending Tx requests.
1310  * This atomic AH reference counting allows the Tx completion
1311  * handler not to acquire the id_ac_mutex to process every completion,
1312  * thus reducing lock contention problems between completion and
1313  * the Tx path.
1314  */
1315 #define	CYCLEVAL		0x80000
1316 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
1317 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
1318 #define	GET_REF(ace)		((ace)->ac_ref)
1319 #define	GET_REF_CYCLE(ace) (				\
1320 	/*						\
1321 	 * Make sure "cycle" bit is set.		\
1322 	 */						\
1323 	ASSERT(CYCLE_SET(ace)),				\
1324 	((ace)->ac_ref & ~(CYCLEVAL))			\
1325 )
1326 #define	INC_REF(ace, num) {				\
1327 	atomic_add_32(&(ace)->ac_ref, num);		\
1328 }
1329 #define	SET_CYCLE_IF_REF(ace) (				\
1330 	CYCLE_SET(ace) ? B_TRUE :			\
1331 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
1332 		CYCLEVAL ?				\
1333 		/*					\
1334 		 * Clear the "cycle" bit we just set;	\
1335 		 * ref count known to be 0 from above.	\
1336 		 */					\
1337 		CLEAR_REFCYCLE(ace), B_FALSE :		\
1338 		/*					\
1339 		 * We set "cycle" bit; let caller know.	\
1340 		 */					\
1341 		B_TRUE					\
1342 )
1343 #define	DEC_REF_DO_CYCLE(ace) (				\
1344 	atomic_add_32_nv(&ace->ac_ref, -1) ==		\
1345 	    CYCLEVAL ?					\
1346 		/*					\
1347 		 * Ref count known to be 0 from above.	\
1348 		 */					\
1349 		B_TRUE :				\
1350 		B_FALSE					\
1351 )
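/*
 * Worked example (added for illustration): with CYCLEVAL = 0x80000, the
 * low bits of ac_ref hold the pending Tx count and bit 19 is the recycle
 * flag (the Tx count is bounded by the number of send wqes, so it can
 * never reach 0x80000 by itself). If three transmits are pending,
 * ac_ref == 3; SET_CYCLE_IF_REF() then makes it 0x80003. Successive
 * completions running DEC_REF_DO_CYCLE() see 0x80002, 0x80001 and finally
 * 0x80000 == CYCLEVAL, so only the last completion gets B_TRUE and knows
 * the entry may now be recycled.
 */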
1352 
1353 static void *
1354 list_get_head(list_t *list)
1355 {
1356 	list_node_t *lhead = list_head(list);
1357 
1358 	if (lhead != NULL)
1359 		list_remove(list, lhead);
1360 	return (lhead);
1361 }
1362 
1363 /*
1364  * This is always guaranteed to be able to queue the work.
1365  */
1366 static void
1367 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1368 {
1369 	/* Initialize request */
1370 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1371 	ptr->rq_op = op;
1372 
1373 	/*
1374 	 * Queue provided slot onto request pool.
1375 	 */
1376 	mutex_enter(&state->id_acache_req_lock);
1377 	list_insert_tail(&state->id_req_list, ptr);
1378 
1379 	/* Go, fetch, async thread */
1380 	cv_signal(&state->id_acache_req_cv);
1381 	mutex_exit(&state->id_acache_req_lock);
1382 }
1383 
1384 /*
1385  * Main body of the per interface async thread.
1386  */
1387 static void
1388 ibd_async_work(ibd_state_t *state)
1389 {
1390 	ibd_req_t *ptr;
1391 	callb_cpr_t cprinfo;
1392 
1393 	mutex_enter(&state->id_acache_req_lock);
1394 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1395 	    callb_generic_cpr, "ibd_async_work");
1396 	for (;;) {
1397 		ptr = list_get_head(&state->id_req_list);
1398 		if (ptr != NULL) {
1399 			mutex_exit(&state->id_acache_req_lock);
1400 
1401 			/*
1402 			 * Once we have done the operation, there is no
1403 			 * guarantee the request slot is going to be valid,
1404 			 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP).
1405 			 */
1406 
1407 			/* Perform the request */
1408 			switch (ptr->rq_op) {
1409 				case ASYNC_GETAH:
1410 					ibd_async_acache(state, &ptr->rq_mac);
1411 					break;
1412 				case ASYNC_POKE:
1413 					/*
1414 					 * We need the gld_sched; that
1415 					 * happens below. No locks are
1416 					 * needed for the multi_op update.
1417 					 */
1418 					state->id_multi_op = NOTSTARTED;
1419 					break;
1420 				case ASYNC_REAP:
1421 					ibd_async_reap_group(state,
1422 					    ptr->rq_ptr, ptr->rq_gid,
1423 					    IB_MC_JSTATE_FULL);
1424 					break;
1425 				case ASYNC_LEAVE:
1426 				case ASYNC_JOIN:
1427 					ibd_async_multicast(state,
1428 					    ptr->rq_gid, ptr->rq_op);
1429 					break;
1430 				case ASYNC_PROMON:
1431 					ibd_async_setprom(state, B_TRUE);
1432 					break;
1433 				case ASYNC_PROMOFF:
1434 					ibd_async_unsetprom(state, B_TRUE);
1435 					break;
1436 				case ASYNC_TRAP:
1437 					ibd_async_trap(state, ptr);
1438 					break;
1439 				case ASYNC_SCHED:
1440 					ibd_async_txsched(state);
1441 					break;
1442 				case ASYNC_LINK:
1443 					ibd_async_link(state, ptr);
1444 					break;
1445 				case ASYNC_EXIT:
1446 					mutex_enter(&state->id_acache_req_lock);
1447 #ifndef	__lock_lint
1448 					CALLB_CPR_EXIT(&cprinfo);
1449 #endif /* !__lock_lint */
1450 					_NOTE(NOT_REACHED)
1451 					return;
1452 			}
1453 
1454 			/*
1455 			 * Indicate blocked operation can now be retried.
1456 			 * Note gld_sched() gets the gld_maclock,
1457 			 * and the multicast/promiscuous paths
1458 			 * (ibd_set_multicast(), ibd_set_promiscuous())
1459 			 * grab id_acache_req_lock in ibd_queue_work_slot()
1460 			 * with gld_maclock held, so we must not hold the
1461 			 * id_acache_req_lock while calling gld_sched to
1462 			 * prevent deadlock.
1463 			 */
1464 			gld_sched(state->id_macinfo);
1465 
1466 			mutex_enter(&state->id_acache_req_lock);
1467 		} else {
1468 			/*
1469 			 * Nothing to do: wait till new request arrives.
1470 			 */
1471 #ifndef __lock_lint
1472 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1473 			cv_wait(&state->id_acache_req_cv,
1474 			    &state->id_acache_req_lock);
1475 			CALLB_CPR_SAFE_END(&cprinfo,
1476 			    &state->id_acache_req_lock);
1477 #endif /* !_lock_lint */
1478 		}
1479 	}
1480 	/*NOTREACHED*/
1481 }
1482 
1483 /*
1484  * Return when it is safe to queue requests to the async daemon; primarily
1485  * for subnet trap and async event handling. Disallow requests before the
1486  * daemon is created, and when interface deinitialization starts.
1487  */
1488 static boolean_t
1489 ibd_async_safe(ibd_state_t *state)
1490 {
1491 	mutex_enter(&state->id_trap_lock);
1492 	if (state->id_trap_stop) {
1493 		mutex_exit(&state->id_trap_lock);
1494 		return (B_FALSE);
1495 	}
1496 	state->id_trap_inprog++;
1497 	mutex_exit(&state->id_trap_lock);
1498 	return (B_TRUE);
1499 }
1500 
1501 /*
1502  * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
1503  * trap or event handling to complete to kill the async thread and deconstruct
1504  * the mcg/ace list.
1505  */
1506 static void
1507 ibd_async_done(ibd_state_t *state)
1508 {
1509 	mutex_enter(&state->id_trap_lock);
1510 	if (--state->id_trap_inprog == 0)
1511 		cv_signal(&state->id_trap_cv);
1512 	mutex_exit(&state->id_trap_lock);
1513 }
1514 
1515 /*
1516  * Hash functions:
1517  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1518  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1519  * These operate on mac addresses input into ibd_send, but there is no
1520  * guarantee on the alignment of the ipoib_mac_t structure.
1521  */
1522 /*ARGSUSED*/
1523 static uint_t
1524 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1525 {
1526 	ulong_t ptraddr = (ulong_t)key;
1527 	uint_t hval;
1528 
1529 	/*
1530 	 * If the input address is 4 byte aligned, we can just dereference
1531 	 * it. This is most common, since IP will send in a 4 byte aligned
1532  * IP header, which implies the 24 byte IPoIB pseudo header will be
1533 	 * 4 byte aligned too.
1534 	 */
1535 	if ((ptraddr & 3) == 0)
1536 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1537 
1538 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1539 	return (hval);
1540 }
1541 
1542 static int
1543 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1544 {
1545 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1546 		return (0);
1547 	else
1548 		return (1);
1549 }
1550 
1551 /*
1552  * Initialize all the per interface caches and lists; AH cache,
1553  * MCG list etc.
1554  */
1555 static int
1556 ibd_acache_init(ibd_state_t *state)
1557 {
1558 	ibd_ace_t *ce;
1559 	int i;
1560 
1561 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1562 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1563 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1564 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1565 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1566 	    offsetof(ibd_ace_t, ac_list));
1567 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1568 	    offsetof(ibd_ace_t, ac_list));
1569 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1570 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1571 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1572 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1573 	    offsetof(ibd_mce_t, mc_list));
1574 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1575 	    offsetof(ibd_mce_t, mc_list));
1576 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1577 	    offsetof(ibd_req_t, rq_list));
1578 
1579 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1580 	    IBD_NUM_AH, KM_SLEEP);
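	/*
	 * Preallocate IBD_NUM_AH address cache entries, each with its own
	 * UD destination handle, and park them on the free list; the
	 * handle is reprogrammed by ibd_async_acache() once a real
	 * destination has been resolved.
	 */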
1581 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1582 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1583 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1584 			ibd_acache_fini(state);
1585 			return (DDI_FAILURE);
1586 		} else {
1587 			CLEAR_REFCYCLE(ce);
1588 			ce->ac_mce = NULL;
1589 			IBD_ACACHE_INSERT_FREE(state, ce);
1590 		}
1591 	}
1592 	return (DDI_SUCCESS);
1593 }
1594 
1595 static void
1596 ibd_acache_fini(ibd_state_t *state)
1597 {
1598 	ibd_ace_t *ptr;
1599 
1600 	mutex_enter(&state->id_ac_mutex);
1601 
1602 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1603 		ASSERT(GET_REF(ptr) == 0);
1604 		(void) ibt_free_ud_dest(ptr->ac_dest);
1605 	}
1606 
1607 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1608 		ASSERT(GET_REF(ptr) == 0);
1609 		(void) ibt_free_ud_dest(ptr->ac_dest);
1610 	}
1611 
1612 	list_destroy(&state->id_ah_free);
1613 	list_destroy(&state->id_ah_active);
1614 	list_destroy(&state->id_mc_full);
1615 	list_destroy(&state->id_mc_non);
1616 	list_destroy(&state->id_req_list);
1617 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1618 	mutex_exit(&state->id_ac_mutex);
1619 	mutex_destroy(&state->id_ac_mutex);
1620 	mutex_destroy(&state->id_mc_mutex);
1621 	mutex_destroy(&state->id_acache_req_lock);
1622 	cv_destroy(&state->id_acache_req_cv);
1623 }
1624 
1625 /*
1626  * Search AH active hash list for a cached path to input destination.
1627  * If we are "just looking", hold == F. When we are in the Tx path,
1628  * we set hold == T to grab a reference on the AH so that it can not
1629  * be recycled to a new destination while the Tx request is posted.
1630  */
1631 static ibd_ace_t *
1632 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1633 {
1634 	ibd_ace_t *ptr;
1635 
1636 	ASSERT(mutex_owned(&state->id_ac_mutex));
1637 
1638 	/*
1639 	 * Do hash search.
1640 	 */
1641 	if (mod_hash_find(state->id_ah_active_hash,
1642 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1643 		if (hold)
1644 			INC_REF(ptr, num);
1645 		return (ptr);
1646 	}
1647 	return (NULL);
1648 }
1649 
1650 /*
1651  * This is called by the tx side; if an initialized AH is found in
1652  * the active list, it is locked down and can be used; if no entry
1653  * is found, an async request is queued to do path resolution.
1654  */
1655 static ibd_ace_t *
1656 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1657 {
1658 	ibd_ace_t *ptr;
1659 
1660 	/*
1661 	 * Only attempt to print when we can; in the mdt pattr case, the
1662 	 * address is not aligned properly.
1663 	 */
1664 	if (((ulong_t)mac & 3) == 0)
1665 		DPRINT(4,
1666 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1667 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1668 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1669 		    htonl(mac->ipoib_gidsuff[1]));
1670 
1671 	mutex_enter(&state->id_ac_mutex);
1672 
1673 	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
1674 		mutex_exit(&state->id_ac_mutex);
1675 		return (ptr);
1676 	}
1677 
1678 	/*
1679 	 * Implementation of a single outstanding async request; if
1680 	 * the operation is not started yet, queue a request and move
1681 	 * to ongoing state. Remember in id_ah_addr which address we are
1682 	 * queueing the request for, in case we need to flag an error;
1683 	 * any further requests, for the same or a different address, are
1684 	 * sent back to GLD to be retried until the operation completes.
1685 	 * The async thread will update id_ah_op with an error indication
1686 	 * or will set it to indicate the next lookup can start; either
1687 	 * way, it will gld_sched() so that all blocked requests come
1688 	 * back here.
1689 	 */
1690 	*err = GLD_NORESOURCES;
1691 	if (state->id_ah_op == NOTSTARTED) {
1692 		/*
1693 		 * We did not even find the entry; queue a request for it.
1694 		 */
1695 		bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL);
1696 		ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH);
1697 		state->id_ah_op = ONGOING;
1698 		bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1699 	} else if ((state->id_ah_op != ONGOING) &&
1700 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1701 		/*
1702 		 * Check the status of the pathrecord lookup request
1703 		 * we had queued before.
1704 		 */
1705 		if (state->id_ah_op == ERRORED) {
1706 			*err = GLD_FAILURE;
1707 			state->id_ah_error++;
1708 		} else {
1709 			/*
1710 			 * ROUTERED case: We need to send to the
1711 			 * all-router MCG. If we can find the AH for
1712 			 * the mcg, the Tx will be attempted. If we
1713 			 * do not find the AH, we return NORESOURCES
1714 			 * to retry.
1715 			 */
1716 			ipoib_mac_t routermac;
1717 
1718 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1719 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1720 			    numwqe);
1721 		}
1722 		state->id_ah_op = NOTSTARTED;
1723 	} else if ((state->id_ah_op != ONGOING) &&
1724 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1725 		/*
1726 		 * This case can happen when we get a higher band
1727 		 * packet. The easiest way is to reset the state machine
1728 		 * to accommodate the higher priority packet.
1729 		 */
1730 		state->id_ah_op = NOTSTARTED;
1731 	}
1732 	mutex_exit(&state->id_ac_mutex);
1733 
1734 	/*
1735 	 * If the PathRecord lookup failed, retry any other blocked
1736 	 * Tx requests that came in between the time we initiated the
1737 	 * path lookup and now, and that were sent back to GLD under
1738 	 * the single outstanding lookup scheme.
1739 	 */
1740 	if (*err == GLD_FAILURE)
1741 		gld_sched(state->id_macinfo);
1742 	return (ptr);
1743 }
1744 
1745 /*
1746  * Grab a not-currently-in-use AH/PathRecord from the active
1747  * list to recycle to a new destination. Only the async thread
1748  * executes this code.
1749  */
1750 static ibd_ace_t *
1751 ibd_acache_get_unref(ibd_state_t *state)
1752 {
1753 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1754 
1755 	ASSERT(mutex_owned(&state->id_ac_mutex));
1756 
1757 	/*
1758 	 * Do plain linear search.
1759 	 */
1760 	while (ptr != NULL) {
1761 		/*
1762 		 * Note that it is possible that the "cycle" bit
1763 		 * is set on the AH w/o any reference count. The
1764 		 * mcg must have been deleted, and the tx cleanup
1765 		 * just decremented the reference count to 0, but
1766 		 * hasn't gotten around to grabbing the id_ac_mutex
1767 		 * to move the AH into the free list.
1768 		 */
1769 		if (GET_REF(ptr) == 0) {
1770 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1771 			break;
1772 		}
1773 		ptr = list_next(&state->id_ah_active, ptr);
1774 	}
1775 	return (ptr);
1776 }
1777 
1778 /*
1779  * Invoked to clean up AH from active list in case of multicast
1780  * disable, to handle sendonly memberships during mcg traps, and
1781  * for port up processing of multicast and unicast AHs.
1782  * Normally, the AH is taken off the active list, and put into
1783  * the free list to be recycled for a new destination. In case
1784  * Tx requests on the AH have not completed yet, the AH is marked
1785  * for reaping (which will put the AH on the free list) once the Tx's
1786  * complete; in this case, depending on the "force" input, we take
1787  * out the AH from the active list right now, or leave it also for
1788  * the reap operation. Returns TRUE if the AH is taken off the active
1789  * list (and either put into the free list right now, or arranged for
1790  * later), FALSE otherwise.
1791  */
1792 static boolean_t
1793 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1794 {
1795 	ibd_ace_t *acactive;
1796 	boolean_t ret = B_TRUE;
1797 
1798 	ASSERT(mutex_owned(&state->id_ac_mutex));
1799 
1800 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1801 
1802 		/*
1803 		 * Note that the AH might already have the cycle bit set
1804 		 * on it; this might happen if sequences of multicast
1805 		 * enables and disables are coming so fast, that posted
1806 		 * Tx's to the mcg have not completed yet, and the cycle
1807 		 * bit is set successively by each multicast disable.
1808 		 */
1809 		if (SET_CYCLE_IF_REF(acactive)) {
1810 			if (!force) {
1811 				/*
1812 				 * The ace is kept on the active list, further
1813 				 * Tx's can still grab a reference on it; the
1814 				 * ace is reaped when all pending Tx's
1815 				 * referencing the AH complete.
1816 				 */
1817 				ret = B_FALSE;
1818 			} else {
1819 				/*
1820 				 * In the mcg trap case, we always pull the
1821 				 * AH from the active list. And also the port
1822 				 * up multi/unicast case.
1823 				 */
1824 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1825 				acactive->ac_mce = NULL;
1826 			}
1827 		} else {
1828 			/*
1829 			 * Determined the ref count is 0, thus reclaim
1830 			 * immediately after pulling out the ace from
1831 			 * the active list.
1832 			 */
1833 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1834 			acactive->ac_mce = NULL;
1835 			IBD_ACACHE_INSERT_FREE(state, acactive);
1836 		}
1837 
1838 	}
1839 	return (ret);
1840 }
1841 
1842 /*
1843  * Helper function for async path record lookup. If we are trying to
1844  * Tx to a MCG, check our membership, possibly trying to join the
1845  * group if required. If that fails, try to send the packet to the
1846  * all router group (indicated by the redirect output), pointing
1847  * the input mac address to the router mcg address.
1848  */
1849 static ibd_mce_t *
1850 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1851 {
1852 	ib_gid_t mgid;
1853 	ibd_mce_t *mce;
1854 	ipoib_mac_t routermac;
1855 
1856 	*redirect = B_FALSE;
1857 	ibd_n2h_gid(mac, &mgid);
1858 
1859 	/*
1860 	 * Check the FullMember+SendOnlyNonMember list.
1861 	 * Since we are the only one who manipulates the
1862 	 * id_mc_full list, no locks are needed.
1863 	 */
1864 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1865 	if (mce != NULL) {
1866 		DPRINT(4, "ibd_async_mcache : already joined to group");
1867 		return (mce);
1868 	}
1869 
1870 	/*
1871 	 * Not found; try to join(SendOnlyNonMember) and attach.
1872 	 */
1873 	DPRINT(4, "ibd_async_mcache : not joined to group");
1874 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1875 	    NULL) {
1876 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1877 		return (mce);
1878 	}
1879 
1880 	/*
1881 	 * MCGroup not present; try to join the all-router group. If
1882 	 * any of the following steps succeed, we will be redirecting
1883 	 * to the all router group.
1884 	 */
1885 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1886 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1887 		return (NULL);
1888 	*redirect = B_TRUE;
1889 	ibd_n2h_gid(&routermac, &mgid);
1890 	bcopy(&routermac, mac, IPOIB_ADDRL);
1891 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1892 	    mgid.gid_prefix, mgid.gid_guid);
1893 
1894 	/*
1895 	 * Are we already joined to the router group?
1896 	 */
1897 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1898 		DPRINT(4, "ibd_async_mcache : using already joined router"
1899 		    "group\n");
1900 		return (mce);
1901 	}
1902 
1903 	/*
1904 	 * Can we join(SendOnlyNonMember) the router group?
1905 	 */
1906 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1907 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1908 	    NULL) {
1909 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1910 		return (mce);
1911 	}
1912 
1913 	return (NULL);
1914 }
1915 
1916 /*
1917  * Async path record lookup code.
1918  */
1919 static void
1920 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1921 {
1922 	ibd_ace_t *ce;
1923 	ibd_mce_t *mce = NULL;
1924 	ibt_path_attr_t path_attr;
1925 	ibt_path_info_t path_info;
1926 	ib_gid_t destgid;
1927 	int ret = NOTSTARTED;
1928 
1929 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1930 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1931 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1932 	    htonl(mac->ipoib_gidsuff[1]));
1933 
1934 	/*
1935 	 * Check whether we are trying to transmit to a MCG.
1936 	 * In that case, we need to make sure we are a member of
1937 	 * the MCG.
1938 	 */
1939 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1940 		boolean_t redirected;
1941 
1942 		/*
1943 		 * If we can not find or join the group or even
1944 		 * redirect, error out.
1945 		 */
1946 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1947 		    NULL) {
1948 			state->id_ah_op = ERRORED;
1949 			return;
1950 		}
1951 
1952 		/*
1953 		 * If we got redirected, check whether the AH for the new
1954 		 * mcg is already in the cache, in which case there is
1955 		 * nothing more to pull in; otherwise proceed to get the
1956 		 * path for the new mcg. There is no guarantee that
1957 		 * if the AH is currently in the cache, it will still be
1958 		 * there when we look in ibd_acache_lookup(), but that's
1959 		 * okay, we will come back here.
1960 		 */
1961 		if (redirected) {
1962 			ret = ROUTERED;
1963 			DPRINT(4, "ibd_async_acache :  redirected to "
1964 			    "%08X:%08X:%08X:%08X:%08X",
1965 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1966 			    htonl(mac->ipoib_gidpref[1]),
1967 			    htonl(mac->ipoib_gidsuff[0]),
1968 			    htonl(mac->ipoib_gidsuff[1]));
1969 
1970 			mutex_enter(&state->id_ac_mutex);
1971 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1972 				mutex_exit(&state->id_ac_mutex);
1973 				DPRINT(4, "ibd_async_acache : router AH found");
1974 				state->id_ah_op = ROUTERED;
1975 				return;
1976 			}
1977 			mutex_exit(&state->id_ac_mutex);
1978 		}
1979 	}
1980 
1981 	/*
1982 	 * Get an AH from the free list.
1983 	 */
1984 	mutex_enter(&state->id_ac_mutex);
1985 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1986 		/*
1987 		 * No free ones; try to grab an unreferenced active
1988 		 * one. Maybe we need to make the active list LRU,
1989 		 * but that will create more work for Tx callbacks.
1990 		 * Is there a way of not having to pull out the
1991 		 * entry from the active list, but just indicate it
1992 		 * is being recycled? Yes, but that creates one more
1993 		 * check in the fast lookup path.
1994 		 */
1995 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1996 			/*
1997 			 * Pretty serious shortage now.
1998 			 */
1999 			state->id_ah_op = NOTSTARTED;
2000 			mutex_exit(&state->id_ac_mutex);
2001 			DPRINT(10, "ibd_async_acache : failed to find AH "
2002 			    "slot\n");
2003 			return;
2004 		}
2005 		/*
2006 		 * We could check whether ac_mce points to a SendOnly
2007 		 * member and drop that membership now. Or do it lazily
2008 		 * at detach time.
2009 		 */
2010 		ce->ac_mce = NULL;
2011 	}
2012 	mutex_exit(&state->id_ac_mutex);
2013 	ASSERT(ce->ac_mce == NULL);
2014 
2015 	/*
2016 	 * Update the entry.
2017 	 */
2018 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
2019 
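	/*
	 * Build the path query: ask for a single path to the destination
	 * GID derived from the IPoIB MAC, using the SL advertised by the
	 * broadcast group.
	 */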
2020 	bzero(&path_info, sizeof (path_info));
2021 	bzero(&path_attr, sizeof (ibt_path_attr_t));
2022 	path_attr.pa_sgid = state->id_sgid;
2023 	path_attr.pa_num_dgids = 1;
2024 	ibd_n2h_gid(&ce->ac_mac, &destgid);
2025 	path_attr.pa_dgids = &destgid;
2026 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2027 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2028 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
2029 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
2030 		goto error;
2031 	}
2032 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
2033 	    ntohl(ce->ac_mac.ipoib_qpn),
2034 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
2035 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
2036 		goto error;
2037 	}
2038 
2039 	/*
2040 	 * mce is set whenever an AH is being associated with a
2041 	 * MCG; this will come in handy when we leave the MCG. The
2042 	 * lock protects Tx fastpath from scanning the active list.
2043 	 */
2044 	if (mce != NULL)
2045 		ce->ac_mce = mce;
2046 	mutex_enter(&state->id_ac_mutex);
2047 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
2048 	state->id_ah_op = ret;
2049 	mutex_exit(&state->id_ac_mutex);
2050 	return;
2051 error:
2052 	/*
2053 	 * We might want to drop SendOnly membership here if we
2054 	 * joined above. The lock protects Tx callbacks inserting
2055 	 * into the free list.
2056 	 */
2057 	mutex_enter(&state->id_ac_mutex);
2058 	state->id_ah_op = ERRORED;
2059 	IBD_ACACHE_INSERT_FREE(state, ce);
2060 	mutex_exit(&state->id_ac_mutex);
2061 }
2062 
2063 /*
2064  * While restoring the port's presence on the subnet on a port up, it is
2065  * possible that the port goes down again.
2066  */
2067 static void
2068 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
2069 {
2070 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
2071 	int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN :
2072 	    GLD_LINKSTATE_UP;
2073 	ibd_mce_t *mce, *pmce;
2074 	ibd_ace_t *ace, *pace;
2075 
2076 	DPRINT(10, "ibd_async_link(): %d", opcode);
2077 
2078 	/*
2079 	 * On a link up, revalidate the link speed/width. There is no point
2080 	 * doing this on a link down, since SA operations are not possible
2081 	 * and the speed would default to the lowest value. Also note that
2082 	 * we update our notion of speed before calling gld_linkstate(),
2083 	 * which performs the necessary higher level speed notifications.
2084 	 */
2085 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
2086 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2087 		state->id_link_speed = ibd_get_portspeed(state);
2088 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2089 	}
2090 
2091 	/*
2092 	 * Do all the work required to establish our presence on
2093 	 * the subnet.
2094 	 */
2095 	if (opcode == IBD_LINK_UP_ABSENT) {
2096 		/*
2097 		 * If in promiscuous mode ...
2098 		 */
2099 		if (state->id_prom_op == COMPLETED) {
2100 			/*
2101 			 * Drop all nonmembership.
2102 			 */
2103 			ibd_async_unsetprom(state, B_FALSE);
2104 
2105 			/*
2106 			 * Then, try to regain nonmembership to all mcg's.
2107 			 */
2108 			ibd_async_setprom(state, B_FALSE);
2109 
2110 		}
2111 
2112 		/*
2113 		 * Drop all sendonly membership (which also gets rid of the
2114 		 * AHs); try to reacquire all full membership.
2115 		 */
2116 		mce = list_head(&state->id_mc_full);
2117 		while ((pmce = mce) != NULL) {
2118 			mce = list_next(&state->id_mc_full, mce);
2119 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
2120 				ibd_leave_group(state,
2121 				    pmce->mc_info.mc_adds_vect.av_dgid,
2122 				    IB_MC_JSTATE_SEND_ONLY_NON);
2123 			else
2124 				ibd_reacquire_group(state, pmce);
2125 		}
2126 
2127 		/*
2128 		 * Recycle all active AHs to free list (and if there are
2129 		 * pending posts, make sure they will go into the free list
2130 		 * once the Tx's complete). Grab the lock to prevent
2131 		 * concurrent Tx's as well as Tx cleanups.
2132 		 */
2133 		mutex_enter(&state->id_ac_mutex);
2134 		ace = list_head(&state->id_ah_active);
2135 		while ((pace = ace) != NULL) {
2136 			boolean_t cycled;
2137 
2138 			ace = list_next(&state->id_ah_active, ace);
2139 			mce = pace->ac_mce;
2140 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
2141 			    B_TRUE);
2142 			/*
2143 			 * If this is for an mcg, it must be for a fullmember,
2144 			 * since we got rid of send-only members above when
2145 			 * processing the mce list.
2146 			 */
2147 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2148 			    IB_MC_JSTATE_FULL)));
2149 
2150 			/*
2151 			 * Check if the fullmember mce needs to be torn down,
2152 			 * ie whether the DLPI disable has already been done.
2153 			 * If so, do some of the work of tx_cleanup, namely
2154 			 * causing leave (which will fail), detach and
2155 			 * mce-freeing. tx_cleanup will put the AH into free
2156 			 * list. The reason to duplicate some of this
2157 			 * tx_cleanup work is because we want to delete the
2158 			 * AH right now instead of waiting for tx_cleanup, to
2159 			 * force subsequent Tx's to reacquire an AH.
2160 			 */
2161 			if ((mce != NULL) && (mce->mc_fullreap))
2162 				ibd_async_reap_group(state, mce,
2163 				    mce->mc_info.mc_adds_vect.av_dgid,
2164 				    mce->mc_jstate);
2165 		}
2166 		mutex_exit(&state->id_ac_mutex);
2167 	}
2168 
2169 	/*
2170 	 * Macinfo is guaranteed to exist since driver does ibt_close_hca()
2171 	 * (which stops further events from being delivered) before
2172 	 * gld_mac_free(). At this point, it is guaranteed that gld_register
2173 	 * has already been done.
2174 	 */
2175 	mutex_enter(&state->id_link_mutex);
2176 	state->id_link_state = lstate;
2177 	gld_linkstate(state->id_macinfo, lstate);
2178 	mutex_exit(&state->id_link_mutex);
2179 
2180 	/*
2181 	 * Free the request slot allocated by the event thread.
2182 	 */
2183 	kmem_free(req, sizeof (ibd_req_t));
2184 
2185 	ibd_async_done(state);
2186 }
2187 
2188 /*
2189  * When the link is notified up, we need to do a few things, based
2190  * on the port's current p_init_type_reply claiming a reinit has been
2191  * done or not. The reinit steps are:
2192  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2193  *    the old Pkey and GID0 are correct.
2194  * 2. Register for mcg traps (already done by ibmf).
2195  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2196  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2197  * 4. Give up all sendonly memberships.
2198  * 5. Acquire all full memberships.
2199  * 6. In promiscuous mode, acquire all non memberships.
2200  * 7. Recycle all AHs to free list.
2201  */
2202 static void
2203 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2204 {
2205 	ibt_hca_portinfo_t *port_infop;
2206 	ibt_status_t ibt_status;
2207 	uint_t psize, port_infosz;
2208 	ibd_link_op_t opcode;
2209 	ibd_req_t *req;
2210 
2211 	/*
2212 	 * Do not send a request to the async daemon if it has not
2213 	 * yet been created or is being destroyed. If the async
2214 	 * daemon has not yet been created, we still need to track
2215 	 * last known state of the link. If this code races with the
2216 	 * detach path, then we are assured that the detach path has
2217 	 * not yet done the ibt_close_hca (which waits for all async
2218 	 * events to complete). If the code races with the attach path,
2219 	 * we need to validate the pkey/gid (in the link_up case) if
2220 	 * the initialization path has already set these up and created
2221 	 * IBTF resources based on the values.
2222 	 */
2223 	mutex_enter(&state->id_link_mutex);
2224 
2225 	/*
2226 	 * If the init code in ibd_drv_init hasn't yet set up the
2227 	 * pkey/gid, nothing to do; that code will set the link state.
2228 	 */
2229 	if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) {
2230 		mutex_exit(&state->id_link_mutex);
2231 		return;
2232 	}
2233 
2234 	if (code == IBT_EVENT_PORT_UP) {
2235 		uint8_t itreply;
2236 		boolean_t badup = B_FALSE;
2237 
2238 		ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
2239 		    state->id_port, &port_infop, &psize, &port_infosz);
2240 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
2241 			mutex_exit(&state->id_link_mutex);
2242 			DPRINT(10, "ibd_link_up : failed in"
2243 			    " ibt_query_port()\n");
2244 			return;
2245 		}
2246 
2247 		/*
2248 		 * If the link already went down by the time the handler gets
2249 		 * here, give up; we can not even validate pkey/gid since those
2250 		 * are not valid.
2251 		 */
2252 		if (port_infop->p_linkstate != IBT_PORT_ACTIVE)
2253 			badup = B_TRUE;
2254 
2255 		itreply = port_infop->p_init_type_reply;
2256 
2257 		/*
2258 		 * In InitTypeReply, check if NoLoadReply ==
2259 		 * PreserveContentReply == 0, in which case, verify Pkey/GID0.
2260 		 */
2261 		if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2262 		    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
2263 		    (!badup)) {
2264 			/*
2265 			 * Check that the subnet part of GID0 has not changed.
2266 			 */
2267 			if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid,
2268 			    sizeof (ib_gid_t)) != 0)
2269 				badup = B_TRUE;
2270 
2271 			/*
2272 			 * Check that Pkey/index mapping is still valid.
2273 			 */
2274 			if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) ||
2275 			    (port_infop->p_pkey_tbl[state->id_pkix] !=
2276 			    state->id_pkey))
2277 				badup = B_TRUE;
2278 		}
2279 
2280 		/*
2281 		 * In InitTypeReply, if PreservePresenceReply indicates the SM
2282 		 * has ensured that the port's presence in mcg, traps etc is
2283 		 * intact, nothing more to do.
2284 		 */
2285 		opcode = IBD_LINK_UP_ABSENT;
2286 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2287 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
2288 			opcode = IBD_LINK_UP;
2289 
2290 		if (badup)
2291 			code = IBT_ERROR_PORT_DOWN;
2292 		ibt_free_portinfo(port_infop, port_infosz);
2293 	}
2294 
2295 	if (!ibd_async_safe(state)) {
2296 		state->id_link_state = ((code == IBT_EVENT_PORT_UP) ?
2297 		    GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN);
2298 		mutex_exit(&state->id_link_mutex);
2299 		return;
2300 	}
2301 	mutex_exit(&state->id_link_mutex);
2302 
2303 	if (code == IBT_ERROR_PORT_DOWN)
2304 		opcode = IBD_LINK_DOWN;
2305 
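	/*
	 * Hand the opcode to the async daemon through rq_ptr; the request
	 * slot is freed by ibd_async_link() once it has been processed.
	 */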
2306 	req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP);
2307 	req->rq_ptr = (void *)opcode;
2308 	ibd_queue_work_slot(state, req, ASYNC_LINK);
2309 }
2310 
2311 /*
2312  * For the port up/down events, IBTL guarantees there will not be concurrent
2313  * invocations of the handler. IBTL might coalesce link transition events,
2314  * and not invoke the handler for _each_ up/down transition, but it will
2315  * invoke the handler with the last known state.
2316  */
2317 static void
2318 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2319     ibt_async_code_t code, ibt_async_event_t *event)
2320 {
2321 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2322 
2323 	switch (code) {
2324 	case IBT_ERROR_CATASTROPHIC_CHAN:
2325 		ibd_print_warn(state, "catastrophic channel error");
2326 		break;
2327 	case IBT_ERROR_CQ:
2328 		ibd_print_warn(state, "completion queue error");
2329 		break;
2330 	case IBT_ERROR_PORT_DOWN:
2331 	case IBT_EVENT_PORT_UP:
2332 		/*
2333 		 * Events will be delivered to all instances that have
2334 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2335 		 * Only need to do work for our port; IBTF will deliver
2336 		 * events for other ports on the hca we have ibt_open_hca'ed
2337 		 * too. Note that ibd_drv_init() initializes id_port before
2338 		 * doing ibt_open_hca().
2339 		 */
2340 		ASSERT(state->id_hca_hdl == hca_hdl);
2341 		if (state->id_port != event->ev_port)
2342 			break;
2343 
2344 		ibd_link_mod(state, code);
2345 		break;
2346 
2347 	case IBT_HCA_ATTACH_EVENT:
2348 	case IBT_HCA_DETACH_EVENT:
2349 		/*
2350 		 * When a new card is plugged to the system, attach_event is
2351 		 * invoked. Additionally, a cfgadm needs to be run to make the
2352 		 * card known to the system, and an ifconfig needs to be run to
2353 		 * plumb up any ibd interfaces on the card. In the case of card
2354 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2355 		 * unplumb the ibd interfaces on the card; when the card is
2356 		 * actually unplugged, the detach_event is invoked;
2357 		 * additionally, if any ibd instances are still active on the
2358 		 * card (eg there were no associated RCM scripts), driver's
2359 		 * detach routine is invoked.
2360 		 */
2361 		break;
2362 	default:
2363 		break;
2364 	}
2365 }
2366 
2367 /*
2368  * Attach device to the IO framework.
2369  */
2370 static int
2371 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2372 {
2373 	ibd_state_t *state;
2374 	int instance;
2375 
2376 	switch (cmd) {
2377 		case DDI_ATTACH:
2378 			break;
2379 		case DDI_RESUME:
2380 			/* This driver does not support resume */
2381 		default:
2382 			return (DDI_FAILURE);
2383 	}
2384 
2385 	/*
2386 	 * Allocate soft device data structure
2387 	 */
2388 	instance = ddi_get_instance(dip);
2389 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2390 		return (DDI_FAILURE);
2391 	state = ddi_get_soft_state(ibd_list, instance);
2392 
2393 	/* pre ibt_attach() soft state initialization */
2394 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2395 		DPRINT(10, "ibd_attach : failed in ibd_state_init()");
2396 		goto attach_fail_state_init;
2397 	}
2398 
2399 	/* "attach" to IBTL */
2400 	if (ibt_attach(&ibd_clnt_modinfo, dip, state,
2401 	    &state->id_ibt_hdl) != IBT_SUCCESS) {
2402 		DPRINT(10, "ibd_attach : failed in ibt_attach()");
2403 		goto attach_fail_ibt_attach;
2404 	}
2405 
2406 	/* Finish initializing this driver */
2407 	if (ibd_drv_init(state) != DDI_SUCCESS) {
2408 		DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n");
2409 		goto attach_fail_drv_init;
2410 	}
2411 
2412 	/*
2413 	 *  Register ourselves with the GLD interface
2414 	 *
2415 	 *  gld_register will:
2416 	 *	link us with the GLD module;
2417 	 *	set our ddi_set_driver_private(9F) data to the macinfo ptr;
2418 	 *	save the devinfo pointer in macinfo->gldm_devinfo;
2419 	 *	create the minor device node.
2420 	 */
2421 	if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) {
2422 		DPRINT(10, "ibd_attach : failed in gld_register()");
2423 		goto attach_fail_gld_register;
2424 	}
2425 
2426 	/*
2427 	 * Set up the handler we will use for regular DLPI stuff. It is
2428 	 * important to set up the recv handler after registering with gld;
2429 	 * setting it up earlier can occasionally cause an incoming packet
2430 	 * to be forwarded to gld before gld_register() completes. gld then
2431 	 * drops the packet, the drop goes unnoticed by ibd_rcq_handler, the
2432 	 * CQ is never re-armed for tavor events, and tavor_isr on the recv
2433 	 * path is never invoked again.
2434 	 */
2435 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2436 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
2437 	    IBT_SUCCESS) {
2438 		DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n");
2439 		goto attach_fail_gld_register;
2440 	}
2441 
2442 	/*
2443 	 * Setup the subnet notices handler after we initialize the a/mcaches
2444 	 * and start the async thread, both of which are required for the
2445 	 * trap handler to function properly. Enable the trap handler to
2446 	 * queue requests to the async thread after the gld_register, because
2447 	 * the async daemon invokes gld_sched(), which must be done after
2448 	 * gld_register().
2449 	 */
2450 	ibt_register_subnet_notices(state->id_ibt_hdl,
2451 	    ibd_snet_notices_handler, state);
2452 	mutex_enter(&state->id_trap_lock);
2453 	state->id_trap_stop = B_FALSE;
2454 	mutex_exit(&state->id_trap_lock);
2455 
2456 	/*
2457 	 * Indicate link status to GLD and higher layers. By default,
2458 	 * we assume we are in up state (which must have been true at
2459 	 * least at the time the broadcast mcg's were probed); if there
2460 	 * were any up/down transitions till the time we come here, the
2461 	 * async handler will have updated last known state, which we
2462 	 * use to tell GLD. The async handler will not send any
2463 	 * notifications to GLD till we reach here in the initialization
2464 	 * sequence.
2465 	 */
2466 	mutex_enter(&state->id_link_mutex);
2467 	gld_linkstate(state->id_macinfo, state->id_link_state);
2468 	mutex_exit(&state->id_link_mutex);
2469 
2470 	return (DDI_SUCCESS);
2471 
2472 	/* Attach failure points, cleanup */
2473 attach_fail_gld_register:
2474 	ibd_drv_fini(state);
2475 
2476 attach_fail_drv_init:
2477 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2478 		ibd_print_warn(state, "failed to free IB resources");
2479 
2480 attach_fail_ibt_attach:
2481 	ibd_state_fini(state);
2482 
2483 attach_fail_state_init:
2484 	ddi_soft_state_free(ibd_list, instance);
2485 
2486 	return (DDI_FAILURE);
2487 }
2488 
2489 /*
2490  * Detach device from the IO framework.
2491  */
2492 static int
2493 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2494 {
2495 	ibd_state_t *state;
2496 	int status;
2497 	int instance;
2498 
2499 	switch (cmd) {
2500 		case DDI_DETACH:
2501 			break;
2502 		case DDI_SUSPEND:
2503 		default:
2504 			return (DDI_FAILURE);
2505 	}
2506 
2507 	instance = ddi_get_instance(dip);
2508 	state = ddi_get_soft_state(ibd_list, instance);
2509 
2510 	/*
2511 	 * First, stop receive interrupts; this stops the
2512 	 * driver from handing up buffers to higher layers.
2513 	 * Wait for receive buffers to be returned; give up
2514 	 * after 5 seconds.
2515 	 */
2516 	ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
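	/* 50 polls at 100 ms apiece implements the 5 second budget above */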
2517 	status = 50;
2518 	while (state->id_rx_list.dl_bufs_outstanding > 0) {
2519 		delay(drv_usectohz(100000));
2520 		if (--status == 0) {
2521 			DPRINT(2, "ibd_detach : reclaiming failed");
2522 			goto failed;
2523 		}
2524 	}
2525 
2526 	if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) {
2527 		DPRINT(10, "ibd_detach : failed in gld_unregister()");
2528 		goto failed;
2529 	}
2530 
2531 	ibd_drv_fini(state);
2532 
2533 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2534 		ibd_print_warn(state, "failed to free all IB resources at "
2535 		    "driver detach time");
2536 
2537 	ibd_state_fini(state);
2538 	ddi_soft_state_free(ibd_list, instance);
2539 	return (DDI_SUCCESS);
2540 
2541 failed:
2542 	/*
2543 	 * Reap all the Tx/Rx completions that were posted since we
2544 	 * turned off the notification. Turn on notifications. There
2545 	 * is a race in that we do not reap completions that come in
2546 	 * after the poll and before notifications get turned on. That
2547 	 * is okay, the next rx/tx packet will trigger a completion
2548 	 * that will reap any missed completions.
2549 	 */
2550 	ibd_poll_compq(state, state->id_rcq_hdl);
2551 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2552 	return (DDI_FAILURE);
2553 }
2554 
2555 /*
2556  * Pre ibt_attach() driver initialization
2557  */
2558 static int
2559 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2560 {
2561 	gld_mac_info_t *macinfo;
2562 
2563 	if ((macinfo = gld_mac_alloc(dip)) == NULL) {
2564 		DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()");
2565 		return (DDI_FAILURE);
2566 	}
2567 
2568 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2569 	state->id_link_state = GLD_LINKSTATE_UNKNOWN;
2570 
2571 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2572 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2573 	state->id_trap_stop = B_TRUE;
2574 	state->id_trap_inprog = 0;
2575 
2576 	mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL);
2577 	state->id_dip = dip;
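	/* scratch work completion arrays, sized for polling the CQs */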
2578 	state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP);
2579 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP);
2580 
2581 	state->id_sched_queued = B_FALSE;
2582 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2583 
2584 	state->id_tx_list.dl_head = NULL;
2585 	state->id_tx_list.dl_tail = NULL;
2586 	state->id_tx_list.dl_pending_sends = B_FALSE;
2587 	state->id_tx_list.dl_cnt = 0;
2588 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2589 
2590 	state->id_rx_list.dl_head = NULL;
2591 	state->id_rx_list.dl_tail = NULL;
2592 	state->id_rx_list.dl_bufs_outstanding = 0;
2593 	state->id_rx_list.dl_cnt = 0;
2594 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2595 
2596 	/* Link up various structs for later access */
2597 	macinfo->gldm_private = (caddr_t)state;
2598 	state->id_macinfo = macinfo;
2599 
2600 	/*
2601 	 * Initialize pointers to device specific functions which will be
2602 	 * used by the generic layer.
2603 	 */
2604 	macinfo->gldm_reset = ibd_reset;
2605 	macinfo->gldm_start = ibd_start;
2606 	macinfo->gldm_stop = ibd_stop;
2607 	macinfo->gldm_set_mac_addr = ibd_set_mac_addr;
2608 	macinfo->gldm_set_multicast = ibd_set_multicast;
2609 	macinfo->gldm_set_promiscuous = ibd_set_promiscuous;
2610 	macinfo->gldm_get_stats = ibd_get_stats;
2611 	macinfo->gldm_send = ibd_send;
2612 	macinfo->gldm_intr = ibd_intr;
2613 	macinfo->gldm_mdt_pre = ibd_mdt_pre;
2614 	macinfo->gldm_mdt_send = ibd_mdt_txone;
2615 	macinfo->gldm_mdt_post = ibd_mdt_post;
2616 	macinfo->gldm_mdt_sgl = state->id_max_sqseg;
2617 	macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS;
2618 
2619 	/* Initialize board characteristics needed by the generic layer. */
2620 	macinfo->gldm_ident = "InfiniBand DLPI Driver";
2621 	macinfo->gldm_type = DL_IB;
2622 	macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */
2623 	macinfo->gldm_addrlen = IPOIB_ADDRL;
2624 	macinfo->gldm_saplen = -2;
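	/* negative saplen: the 2-byte SAP follows the address in a DLSAP */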
2625 	macinfo->gldm_capabilities = GLD_CAP_LINKSTATE;
2626 
2627 	/* Other required initialization */
2628 	macinfo->gldm_ppa = ddi_get_instance(dip);
2629 	macinfo->gldm_devinfo = dip;
2630 
2631 	return (DDI_SUCCESS);
2632 }
2633 
2634 /*
2635  * Post ibt_detach() driver deconstruction
2636  */
2637 static void
2638 ibd_state_fini(ibd_state_t *state)
2639 {
2640 	mutex_destroy(&state->id_tx_list.dl_mutex);
2641 	mutex_destroy(&state->id_rx_list.dl_mutex);
2642 	mutex_destroy(&state->id_sched_lock);
2643 	mutex_destroy(&state->id_txcomp_lock);
2644 	kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE);
2645 	kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE);
2646 	cv_destroy(&state->id_trap_cv);
2647 	mutex_destroy(&state->id_trap_lock);
2648 	mutex_destroy(&state->id_link_mutex);
2649 	gld_mac_free(state->id_macinfo);
2650 }
2651 
2652 /*
2653  * Fetch IBA parameters for the network device from IB nexus.
2654  */
2655 static int
2656 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid)
2657 {
2658 	/*
2659 	 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
2660 	 * Note that the default partition is also allowed.
2661 	 */
2662 	state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2663 	    0, "port-pkey", IB_PKEY_INVALID_LIMITED);
2664 	if (state->id_pkey <= IB_PKEY_INVALID_FULL) {
2665 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong"
2666 		    "partition\n");
2667 		return (DDI_FAILURE);
2668 	}
2669 
2670 	/*
2671 	 * ... the IBA port ...
2672 	 */
2673 	state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2674 	    0, "port-number", 0);
2675 	if (state->id_port == 0) {
2676 		DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n");
2677 		return (DDI_FAILURE);
2678 	}
2679 
2680 	/*
2681 	 * ... and HCA GUID.
2682 	 */
2683 	*hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
2684 	    0, "hca-guid", 0);
2685 	if (*hca_guid == 0) {
2686 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong "
2687 		    "guid\n");
2688 		return (DDI_FAILURE);
2689 	}
2690 
2691 	return (DDI_SUCCESS);
2692 }
2693 
2694 /*
2695  * Fetch link speed from SA for snmp ifspeed reporting.
2696  */
2697 static uint64_t
2698 ibd_get_portspeed(ibd_state_t *state)
2699 {
2700 	int			ret;
2701 	uint64_t		ifspeed;
2702 	size_t			length;
2703 	ib_lid_t		lid;
2704 	sa_portinfo_record_t	req, *resp = NULL;
2705 	ibmf_saa_access_args_t	args;
2706 	ibmf_saa_handle_t	saa_handle;
2707 
2708 	/*
2709 	 * Due to serdes 8b/10b encoding on the wire, the 2.5 Gbps signaling
2710 	 * rate of a 1X link translates to a 2 Gbps data rate. Thus, 1X
2711 	 * single data rate is 2000000000 bps; start with that as the default.
2712 	 */
2713 	ifspeed = 2000000000;
2714 
2715 	/* Get port lid */
2716 	if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL,
2717 	    &lid) != IBT_SUCCESS)
2718 		goto earlydone;
2719 
2720 	if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL,
2721 	    IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS)
2722 		goto earlydone;
2723 
2724 	/* Contact SA Access */
2725 	bzero(&req, sizeof (sa_portinfo_record_t));
2726 	req.EndportLID = lid;
2727 
2728 	args.sq_attr_id		= SA_PORTINFORECORD_ATTRID;
2729 	args.sq_access_type	= IBMF_SAA_RETRIEVE;
2730 	args.sq_component_mask	= SA_PORTINFO_COMPMASK_PORTLID;
2731 	args.sq_template	= &req;
2732 	args.sq_callback	= NULL;
2733 	args.sq_callback_arg	= NULL;
2734 
2735 	ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp);
2736 	if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL))
2737 		goto done;
2738 
2739 	/*
2740 	 * 4X/12X needs appropriate multipliers. With IBA 1.2 additions,
2741 	 * double and quad multipliers are also needed per LinkSpeedEnabled.
2742 	 * In case SA does not return an expected value, report the default
2743 	 * speed as 1X.
2744 	 */
2745 	ret = 1;
2746 	switch (resp->PortInfo.LinkWidthActive) {
2747 		case SM_LINK_WIDTH_ACTIVE_1X:
2748 			ret = 1;
2749 			break;
2750 		case SM_LINK_WIDTH_ACTIVE_4X:
2751 			ret = 4;
2752 			break;
2753 		case SM_LINK_WIDTH_ACTIVE_12X:
2754 			ret = 12;
2755 			break;
2756 	}
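	/* e.g. an active width of 4X yields 4 * 2 Gbps = 8000000000 bps */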
2757 	ifspeed *= ret;
2758 	kmem_free(resp, length);
2759 
2760 done:
2761 	(void) ibmf_sa_session_close(&saa_handle, 0);
2762 
2763 earlydone:
2764 	return (ifspeed);
2765 }
2766 
2767 /*
2768  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2769  * representing the input mcg mgid.
2770  */
2771 static ibd_mce_t *
2772 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2773 {
2774 	ibd_mce_t *ptr = list_head(mlist);
2775 
2776 	/*
2777 	 * Do plain linear search.
2778 	 */
2779 	while (ptr != NULL) {
2780 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2781 		    sizeof (ib_gid_t)) == 0)
2782 			return (ptr);
2783 		ptr = list_next(mlist, ptr);
2784 	}
2785 	return (NULL);
2786 }
2787 
2788 /*
2789  * Execute IBA JOIN.
2790  */
2791 static ibt_status_t
2792 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2793 {
2794 	ibt_mcg_attr_t mcg_attr;
2795 
2796 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2797 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2798 	mcg_attr.mc_mgid = mgid;
2799 	mcg_attr.mc_join_state = mce->mc_jstate;
2800 	mcg_attr.mc_scope = state->id_scope;
2801 	mcg_attr.mc_pkey = state->id_pkey;
2802 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2803 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2804 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2805 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2806 	    NULL, NULL));
2807 }
2808 
2809 /*
2810  * This code JOINs the port in the proper way (depending on the join
2811  * state) so that IBA fabric will forward mcg packets to/from the port.
2812  * It also attaches the QPN to the mcg so it can receive those mcg
2813  * packets. This code makes sure not to attach the mcg to the QP if
2814  * that has been previously done due to the mcg being joined with a
2815  * different join state, even though this is not required by SWG_0216,
2816  * refid 3610.
2817  */
2818 static ibd_mce_t *
2819 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2820 {
2821 	ibt_status_t ibt_status;
2822 	ibd_mce_t *mce, *tmce, *omce = NULL;
2823 	boolean_t do_attach = B_TRUE;
2824 
2825 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2826 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2827 
2828 	/*
2829 	 * For enable_multicast Full member joins, we need to do some
2830 	 * extra work. If there is already an mce on the list that
2831 	 * indicates full membership, that means the membership has
2832 	 * not yet been dropped (since the disable_multicast was issued)
2833 	 * because there are pending Tx's to the mcg; in that case, just
2834 	 * mark the mce not to be reaped when the Tx completion queues
2835 	 * an async reap operation.
2836 	 *
2837 	 * If there is already an mce on the list indicating sendonly
2838 	 * membership, try to promote to full membership. Be careful
2839 	 * not to deallocate the old mce, since there might be an AH
2840 	 * pointing to it; instead, update the old mce with new data
2841 	 * that tracks the full membership.
2842 	 */
2843 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2844 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2845 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2846 			ASSERT(omce->mc_fullreap);
2847 			omce->mc_fullreap = B_FALSE;
2848 			return (omce);
2849 		} else {
2850 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2851 		}
2852 	}
2853 
2854 	/*
2855 	 * Allocate the ibd_mce_t to track this JOIN.
2856 	 */
2857 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2858 	mce->mc_fullreap = B_FALSE;
2859 	mce->mc_jstate = jstate;
2860 
2861 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2862 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2863 		    ibt_status);
2864 		kmem_free(mce, sizeof (ibd_mce_t));
2865 		return (NULL);
2866 	}
2867 
2868 	/*
2869 	 * Is an IBA attach required? Not if the interface is already joined
2870 	 * to the mcg in a different appropriate join state.
2871 	 */
2872 	if (jstate == IB_MC_JSTATE_NON) {
2873 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2874 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2875 			do_attach = B_FALSE;
2876 	} else if (jstate == IB_MC_JSTATE_FULL) {
2877 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2878 			do_attach = B_FALSE;
2879 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2880 		do_attach = B_FALSE;
2881 	}
2882 
2883 	if (do_attach) {
2884 		/*
2885 		 * Do the IBA attach.
2886 		 */
2887 		DPRINT(10, "ibd_join_group : ibt_attach_mcg \n");
2888 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2889 		    &mce->mc_info)) != IBT_SUCCESS) {
2890 			DPRINT(10, "ibd_join_group : failed qp attachment "
2891 			    "%d\n", ibt_status);
2892 			/*
2893 			 * NOTE that we should probably preserve the join info
2894 			 * in the list and later try to leave again at detach
2895 			 * time.
2896 			 */
2897 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2898 			    state->id_sgid, jstate);
2899 			kmem_free(mce, sizeof (ibd_mce_t));
2900 			return (NULL);
2901 		}
2902 	}
2903 
2904 	/*
2905 	 * Insert the ibd_mce_t in the proper list.
2906 	 */
2907 	if (jstate == IB_MC_JSTATE_NON) {
2908 		IBD_MCACHE_INSERT_NON(state, mce);
2909 	} else {
2910 		/*
2911 		 * Set up the mc_req fields used for reaping the
2912 		 * mcg in case of delayed tx completion (see
2913 		 * ibd_tx_cleanup()). Also done for sendonly join in
2914 		 * case we are promoted to fullmembership later and
2915 		 * keep using the same mce.
2916 		 */
2917 		mce->mc_req.rq_gid = mgid;
2918 		mce->mc_req.rq_ptr = mce;
2919 		/*
2920 		 * Check whether this is the case of trying to join
2921 		 * full member, and we were already joined send only.
2922 		 * We try to drop our SendOnly membership, but it is
2923 		 * possible that the mcg does not exist anymore (and
2924 		 * the subnet trap never reached us), so the leave
2925 		 * operation might fail.
2926 		 */
2927 		if (omce != NULL) {
2928 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2929 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2930 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2931 			bcopy(&mce->mc_info, &omce->mc_info,
2932 			    sizeof (ibt_mcg_info_t));
2933 			kmem_free(mce, sizeof (ibd_mce_t));
2934 			return (omce);
2935 		}
2936 		mutex_enter(&state->id_mc_mutex);
2937 		IBD_MCACHE_INSERT_FULL(state, mce);
2938 		mutex_exit(&state->id_mc_mutex);
2939 	}
2940 
2941 	return (mce);
2942 }
2943 
2944 /*
2945  * Called during port up event handling to attempt to reacquire full
2946  * membership to an mcg. Stripped down version of ibd_join_group().
2947  * Note that it is possible that the mcg might have gone away, and
2948  * gets recreated at this point.
2949  */
2950 static void
2951 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2952 {
2953 	ib_gid_t mgid;
2954 
2955 	/*
2956 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2957 	 * reap/leave is going to try to leave the group. We could prevent
2958 	 * that by adding a boolean flag into ibd_mce_t, if required.
2959 	 */
2960 	if (mce->mc_fullreap)
2961 		return;
2962 
2963 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2964 
2965 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2966 	    mgid.gid_guid);
2967 
2968 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2969 		ibd_print_warn(state, "Failure on port up to rejoin "
2970 		    "multicast gid %016llx:%016llx",
2971 		    (u_longlong_t)mgid.gid_prefix,
2972 		    (u_longlong_t)mgid.gid_guid);
2973 }
2974 
2975 /*
2976  * This code handles delayed Tx completion cleanups for mcg's to which
2977  * disable_multicast has been issued, regular mcg related cleanups during
2978  * disable_multicast, disable_promiscuous and mcg traps, as well as
2979  * cleanups during driver detach time. Depending on the join state,
2980  * it deletes the mce from the appropriate list and issues the IBA
2981  * leave/detach; except in the disable_multicast case when the mce
2982  * is left on the active list for a subsequent Tx completion cleanup.
2983  */
2984 static void
2985 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2986     uint8_t jstate)
2987 {
2988 	ibd_mce_t *tmce;
2989 	boolean_t do_detach = B_TRUE;
2990 
2991 	/*
2992 	 * Before detaching, we must check whether the other list
2993 	 * contains the mcg; if we detach blindly, the consumer
2994 	 * who set up the other list will also stop receiving
2995 	 * traffic.
2996 	 */
2997 	if (jstate == IB_MC_JSTATE_FULL) {
2998 		/*
2999 		 * The following check is only relevant while coming
3000 		 * from the Tx completion path in the reap case.
3001 		 */
3002 		if (!mce->mc_fullreap)
3003 			return;
3004 		mutex_enter(&state->id_mc_mutex);
3005 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3006 		mutex_exit(&state->id_mc_mutex);
3007 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3008 			do_detach = B_FALSE;
3009 	} else if (jstate == IB_MC_JSTATE_NON) {
3010 		IBD_MCACHE_PULLOUT_NON(state, mce);
3011 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3012 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3013 			do_detach = B_FALSE;
3014 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3015 		mutex_enter(&state->id_mc_mutex);
3016 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3017 		mutex_exit(&state->id_mc_mutex);
3018 		do_detach = B_FALSE;
3019 	}
3020 
3021 	/*
3022 	 * If we are reacting to a mcg trap and leaving our sendonly or
3023 	 * non membership, the mcg is possibly already gone, so attempting
3024 	 * to leave might fail. On the other hand, we must try to leave
3025 	 * anyway, since this might be a trap from long ago, and we could
3026 	 * have potentially sendonly joined to a recent incarnation of
3027 	 * the mcg and are about to lose track of this information.
3028 	 */
3029 	if (do_detach) {
3030 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3031 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3032 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3033 	}
3034 
3035 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3036 	kmem_free(mce, sizeof (ibd_mce_t));
3037 }
3038 
3039 /*
3040  * Async code executed due to multicast and promiscuous disable requests
3041  * and mcg trap handling; also executed during driver detach. Mostly, a
3042  * leave and detach is done; except for the fullmember case when Tx
3043  * requests are pending, whence arrangements are made for subsequent
3044  * cleanup on Tx completion.
3045  */
3046 static void
3047 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3048 {
3049 	ipoib_mac_t mcmac;
3050 	boolean_t recycled;
3051 	ibd_mce_t *mce;
3052 
3053 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3054 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3055 
3056 	if (jstate == IB_MC_JSTATE_NON) {
3057 		recycled = B_TRUE;
3058 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3059 		/*
3060 		 * In case we are handling a mcg trap, we might not find
3061 		 * the mcg in the non list.
3062 		 */
3063 		if (mce == NULL)
3064 			return;
3065 	} else {
3066 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3067 
3068 		/*
3069 		 * In case we are handling a mcg trap, make sure the trap
3070 		 * is not arriving late; if we have an mce that indicates
3071 		 * that we are already a fullmember, that would be a clear
3072 		 * indication that the trap arrived late (ie, is for a
3073 		 * previous incarnation of the mcg).
3074 		 */
3075 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3076 			if ((mce == NULL) || (mce->mc_jstate ==
3077 			    IB_MC_JSTATE_FULL))
3078 				return;
3079 			ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3080 		} else {
3081 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3082 			ASSERT((mce != NULL) && (mce->mc_jstate ==
3083 			    IB_MC_JSTATE_FULL));
3084 			mce->mc_fullreap = B_TRUE;
3085 		}
3086 
3087 		/*
3088 		 * If no pending Tx's remain that reference the AH
3089 		 * for the mcg, recycle it from active to free list.
3090 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3091 		 * so the last completing Tx will cause an async reap
3092 		 * operation to be invoked, at which time we will drop our
3093 		 * membership to the mcg so that the pending Tx's complete
3094 		 * successfully. Refer to comments on "AH and MCE active
3095 		 * list manipulation" at top of this file. The lock protects
3096 		 * against Tx fast path and Tx cleanup code.
3097 		 */
3098 		mutex_enter(&state->id_ac_mutex);
3099 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3100 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3101 		    IB_MC_JSTATE_SEND_ONLY_NON));
3102 		mutex_exit(&state->id_ac_mutex);
3103 	}
3104 
3105 	if (recycled) {
3106 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3107 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3108 		ibd_async_reap_group(state, mce, mgid, jstate);
3109 	}
3110 }
3111 
3112 /*
3113  * Find the broadcast address as defined by IPoIB; implicitly
3114  * determines the IBA scope, mtu, tclass etc of the link the
3115  * interface is going to be a member of.
3116  */
3117 static ibt_status_t
3118 ibd_find_bgroup(ibd_state_t *state)
3119 {
3120 	ibt_mcg_attr_t mcg_attr;
3121 	uint_t numg;
3122 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3123 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3124 	    IB_MC_SCOPE_GLOBAL };
3125 	int i, mcgmtu;
3126 	boolean_t found = B_FALSE;
3127 
3128 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3129 	mcg_attr.mc_pkey = state->id_pkey;
3130 	state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK;
3131 
3132 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3133 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3134 
3135 		/*
3136 		 * Look for the IPoIB broadcast group.
3137 		 */
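		/*
		 * The broadcast GID works out to ff1?:401b:<pkey>::ffff:ffff,
		 * where '?' is the scope nibble currently being probed and
		 * the all-ones IPv4 group comes from the gid_guid set above.
		 */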
3138 		state->id_mgid.gid_prefix =
3139 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3140 		    ((uint64_t)state->id_scope << 48) |
3141 		    ((uint32_t)(state->id_pkey << 16)));
3142 		mcg_attr.mc_mgid = state->id_mgid;
3143 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3144 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3145 			found = B_TRUE;
3146 			break;
3147 		}
3148 
3149 	}
3150 
3151 	if (!found) {
3152 		ibd_print_warn(state, "IPoIB broadcast group absent");
3153 		return (IBT_FAILURE);
3154 	}
3155 
3156 	/*
3157 	 * Verify that the mcg mtu <= id_mtu; fill in the updated id_mtu.
3158 	 */
3159 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3160 	if (state->id_mtu < mcgmtu) {
3161 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3162 		    "greater than port's maximum MTU %d", mcgmtu,
3163 		    state->id_mtu);
3164 		return (IBT_FAILURE);
3165 	}
3166 	state->id_mtu = mcgmtu;
3167 
3168 	return (IBT_SUCCESS);
3169 }
3170 
3171 /*
3172  * Post ibt_attach() initialization.
3173  */
3174 static int
3175 ibd_drv_init(ibd_state_t *state)
3176 {
3177 	kthread_t *kht;
3178 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
3179 	ibt_ud_chan_query_attr_t ud_chan_attr;
3180 	ibt_hca_portinfo_t *port_infop;
3181 	ibt_hca_attr_t hca_attrs;
3182 	ibt_status_t ibt_status;
3183 	ibt_cq_attr_t cq_attr;
3184 	ib_guid_t hca_guid;
3185 	uint32_t real_size;
3186 	uint32_t *ptr;
3187 	char pathname[OBP_MAXPATHLEN];
3188 	uint_t psize, port_infosz;
3189 
3190 	/*
3191 	 * Initialize id_port before ibt_open_hca because of
3192 	 * ordering requirements in port up/down handling.
3193 	 */
3194 	if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS)
3195 		return (DDI_FAILURE);
3196 
3197 	if (ibt_open_hca(state->id_ibt_hdl, hca_guid,
3198 	    &state->id_hca_hdl) != IBT_SUCCESS) {
3199 		DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n");
3200 		return (DDI_FAILURE);
3201 	}
3202 
3203 	mutex_enter(&state->id_link_mutex);
3204 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
3205 	    state->id_port, &port_infop, &psize,
3206 	    &port_infosz);
3207 	if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
3208 		mutex_exit(&state->id_link_mutex);
3209 		DPRINT(10, "ibd_drv_init : failed in ibt_query_hca_ports()\n");
3210 		(void) ibt_close_hca(state->id_hca_hdl);
3211 		return (DDI_FAILURE);
3212 	}
3213 
3214 	/*
3215 	 * If the link already went down by the time we get here, give up;
3216 	 * we can not even get the gid since that is not valid. We would
3217 	 * fail in ibd_find_bgroup() anyway.
3218 	 */
3219 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
3220 		mutex_exit(&state->id_link_mutex);
3221 		ibt_free_portinfo(port_infop, port_infosz);
3222 		(void) ibt_close_hca(state->id_hca_hdl);
3223 		ibd_print_warn(state, "Port is not active");
3224 		return (DDI_FAILURE);
3225 	}
3226 
3227 	/*
3228 	 * This verifies the Pkey ibnexus handed us is still valid.
3229 	 * This is also the point from which the pkey table for the
3230 	 * port must hold the exact pkey value at the exact index
3231 	 * across port up/downs.
3232 	 */
3233 	if (ibt_pkey2index(state->id_hca_hdl, state->id_port,
3234 	    state->id_pkey, &state->id_pkix) != IBT_SUCCESS) {
3235 		mutex_exit(&state->id_link_mutex);
3236 		ibt_free_portinfo(port_infop, port_infosz);
3237 		DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n");
3238 		(void) ibt_close_hca(state->id_hca_hdl);
3239 		return (DDI_FAILURE);
3240 	}
3241 
3242 	state->id_mtu = (128 << port_infop->p_mtu);
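	/*
	 * GID0 of the port (first entry of the sgid table) is our source
	 * gid; the subnet notices handler matches incoming traps on it.
	 */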
3243 	state->id_sgid = *port_infop->p_sgid_tbl;
3244 	state->id_link_state = GLD_LINKSTATE_UP;
3245 	mutex_exit(&state->id_link_mutex);
3246 
3247 	ibt_free_portinfo(port_infop, port_infosz);
3248 
3249 	state->id_link_speed = ibd_get_portspeed(state);
3250 
3251 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
3252 	ASSERT(ibt_status == IBT_SUCCESS);
3253 
3254 	/*
3255 	 * We need to determine whether the HCA can support checksum
3256 	 * and indicate that to higher layers.
3257 	 */
3258 	if (ibd_csum_send > IBD_CSUM_NONE)
3259 		state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL;
3260 
3261 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
3262 		DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n");
3263 		goto drv_init_fail_find_bgroup;
3264 	}
3265 	state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE;
3266 
3267 	if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
3268 	    &state->id_pd_hdl) != IBT_SUCCESS) {
3269 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n");
3270 		goto drv_init_fail_alloc_pd;
3271 	}
3272 
3273 	/* Initialize the parallel ARP cache and AHs */
3274 	if (ibd_acache_init(state) != DDI_SUCCESS) {
3275 		DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n");
3276 		goto drv_init_fail_acache;
3277 	}
3278 
3279 	/*
3280 	 * Check various tunable limits.
3281 	 */
3282 	if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) {
3283 		ibd_print_warn(state, "Setting #sgl = %d instead of default %d",
3284 		    hca_attrs.hca_max_sgl, IBD_MAX_SQSEG);
3285 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
3286 	} else {
3287 		state->id_max_sqseg = IBD_MAX_SQSEG;
3288 	}
3289 
3290 	/*
3291 	 * First, check #r/s wqes against max channel size.
3292 	 */
3293 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE)
3294 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
3295 	else
3296 		state->id_num_rwqe = IBD_NUM_RWQE;
3297 
3298 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE)
3299 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
3300 	else
3301 		state->id_num_swqe = IBD_NUM_SWQE;
3302 
3303 	/*
3304 	 * Allocate Rx/combined CQ:
3305 	 * Theoretically, there is no point in having more than #rwqe
3306 	 * plus #swqe cqe's, except that the CQ will be signalled for
3307 	 * overflow when the last wqe completes, if none of the previous
3308 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
3309 	 * to make sure such overflow does not occur.
3310 	 */
3311 	cq_attr.cq_sched = NULL;
3312 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
3313 
3314 	if (ibd_separate_cqs == 1) {
3315 		/*
3316 		 * Allocate Receive CQ.
3317 		 */
3318 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
3319 			cq_attr.cq_size = state->id_num_rwqe + 1;
3320 		} else {
3321 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3322 			state->id_num_rwqe = cq_attr.cq_size - 1;
3323 		}
3324 
3325 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3326 			ibd_print_warn(state, "Computed #rwqe %d based on "
3327 			    "requested size and supportable CQ size is less "
3328 			    "than the required threshold %d",
3329 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3330 			goto drv_init_fail_min_rwqes;
3331 		}
3332 
3333 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3334 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3335 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3336 			goto drv_init_fail_alloc_rcq;
3337 		}
3338 
3339 		/*
3340 		 * Allocate Send CQ.
3341 		 */
3342 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
3343 			cq_attr.cq_size = state->id_num_swqe + 1;
3344 		} else {
3345 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3346 			state->id_num_swqe = cq_attr.cq_size - 1;
3347 		}
3348 
3349 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3350 		    &state->id_scq_hdl, &real_size) != IBT_SUCCESS) {
3351 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3352 			goto drv_init_fail_alloc_scq;
3353 		}
3354 	} else {
3355 		/*
3356 		 * Allocate combined Send/Receive CQ.
3357 		 */
3358 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
3359 		    state->id_num_swqe + 1)) {
3360 			cq_attr.cq_size = state->id_num_rwqe +
3361 			    state->id_num_swqe + 1;
3362 		} else {
3363 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3364 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
3365 			    state->id_num_rwqe) / (state->id_num_rwqe +
3366 			    state->id_num_swqe);
3367 			state->id_num_swqe = cq_attr.cq_size - 1 -
3368 			    state->id_num_rwqe;
3369 		}
3370 
3371 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3372 			ibd_print_warn(state, "Computed #rwqe %d based on "
3373 			    "requested size and supportable CQ size is less "
3374 			    "than the required threshold %d",
3375 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3376 			goto drv_init_fail_min_rwqes;
3377 		}
3378 
3379 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3380 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3381 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3382 			goto drv_init_fail_alloc_rcq;
3383 		}
3384 		state->id_scq_hdl = state->id_rcq_hdl;
3385 	}
3386 
3387 	/*
3388 	 * Print a message in case we could not allocate as many wqe's
3389 	 * as were requested. Note that in the combined CQ case, both of
3390 	 * the following messages may be printed.
3391 	 */
3392 	if (state->id_num_rwqe != IBD_NUM_RWQE)
3393 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
3394 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
3395 	if (state->id_num_swqe != IBD_NUM_SWQE)
3396 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
3397 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
3398 
3399 	ud_alloc_attr.ud_flags	= IBT_WR_SIGNALED;
3400 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
3401 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
3402 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
3403 	ud_alloc_attr.ud_sizes.cs_sq	= state->id_num_swqe;
3404 	ud_alloc_attr.ud_sizes.cs_rq	= state->id_num_rwqe;
3405 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
3406 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
3407 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
3408 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
3409 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
3410 	ud_alloc_attr.ud_clone_chan	= NULL;
3411 	if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
3412 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) {
3413 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()"
3414 		    "\n");
3415 		goto drv_init_fail_alloc_chan;
3416 	}
3417 
3418 	if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) !=
3419 	    IBT_SUCCESS) {
3420 		DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()");
3421 		goto drv_init_fail_query_chan;
3422 	}
3423 	state->id_qpnum = ud_chan_attr.ud_qpn;
3424 
3425 	/* Initialize the Transmit buffer list */
3426 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
3427 		DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n");
3428 		goto drv_init_fail_txlist_init;
3429 	}
3430 
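	/*
	 * With separate CQs and interrupt driven Tx completions, arm the
	 * send CQ here. In the polling case (ibd_txcomp_poll == 1), Tx
	 * completions are instead reaped from the send path.
	 */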
3431 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
3432 		/* Setup the handler we will use for regular DLPI stuff */
3433 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
3434 		if (ibt_enable_cq_notify(state->id_scq_hdl,
3435 		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
3436 			DPRINT(10, "ibd_drv_init : failed in"
3437 			    " ibt_enable_cq_notify()\n");
3438 			goto drv_init_fail_cq_notify;
3439 		}
3440 	}
3441 
3442 	/* Create the service fifos before we start receiving */
3443 	if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos,
3444 	    state)) == NULL) {
3445 		DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n");
3446 		goto drv_init_fail_srv_fifo;
3447 	}
3448 
3449 	/* Initialize the Receive buffer list */
3450 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
3451 		DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n");
3452 		goto drv_init_fail_rxlist_init;
3453 	}
3454 
3455 	/* Join to IPoIB broadcast group as required by IPoIB */
3456 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
3457 		DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n");
3458 		goto drv_init_fail_join_group;
3459 	}
3460 
3461 	/* Create the async thread */
3462 	if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
3463 	    TS_RUN, minclsyspri)) == NULL) {
3464 		/* Do we have to specially leave the group? */
3465 		DPRINT(10, "ibd_drv_init : failed in thread_create\n");
3466 		goto drv_init_fail_thread_create;
3467 	}
3468 	state->id_async_thrid = kht->t_did;
3469 
3470 	/*
3471 	 * The local mac address is now known. Create the IPoIB
3472 	 * address.
3473 	 */
3474 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
3475 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3476 	state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr;
3477 
3478 	/*
3479 	 * Similarly, program in the broadcast mac address.
3480 	 */
3481 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix,
3482 	    state->id_mgid.gid_guid);
3483 	state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr;
3484 
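	/*
	 * An IPoIB hardware address is 20 bytes: the 4 byte QPN followed
	 * by the 16 byte GID; dump each address as five 32-bit words.
	 */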
3485 	ptr = (uint32_t *)&state->id_macaddr;
3486 	DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n",
3487 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3488 	ptr = (uint32_t *)&state->id_bcaddr;
3489 	DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n",
3490 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3491 	DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n",
3492 	    state->id_pkey, state->id_mgid.gid_prefix,
3493 	    state->id_mgid.gid_guid);
3494 	DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n",
3495 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3496 	DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey);
3497 	DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu);
3498 	(void) ddi_pathname(state->id_dip, pathname);
3499 	DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname);
3500 
3501 	return (DDI_SUCCESS);
3502 
3503 drv_init_fail_thread_create:
3504 	ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL);
3505 
3506 drv_init_fail_join_group:
3507 	ibd_fini_rxlist(state);
3508 
3509 drv_init_fail_rxlist_init:
3510 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3511 
3512 drv_init_fail_srv_fifo:
3513 drv_init_fail_cq_notify:
3514 	ibd_fini_txlist(state);
3515 
3516 drv_init_fail_txlist_init:
3517 drv_init_fail_query_chan:
3518 	if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS)
3519 		DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()");
3520 
3521 drv_init_fail_alloc_chan:
3522 	if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) !=
3523 	    IBT_SUCCESS))
3524 		DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()");
3525 
3526 drv_init_fail_alloc_scq:
3527 	if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS)
3528 		DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()");
3529 
3530 drv_init_fail_min_rwqes:
3531 drv_init_fail_alloc_rcq:
3532 	ibd_acache_fini(state);
3533 drv_init_fail_acache:
3534 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3535 		DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()");
3536 
3537 drv_init_fail_alloc_pd:
3538 	ibt_free_mcg_info(state->id_mcinfo, 1);
3539 drv_init_fail_find_bgroup:
3540 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
3541 		DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()");
3542 
3543 	return (DDI_FAILURE);
3544 }
3545 
3546 /*
3547  * Allocate the statically allocated Tx buffer list.
3548  */
3549 static int
3550 ibd_init_txlist(ibd_state_t *state)
3551 {
3552 	ibd_swqe_t *swqe;
3553 	int i;
3554 
3555 	for (i = 0; i < state->id_num_swqe; i++) {
3556 		if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) {
3557 			DPRINT(10, "ibd_init_txlist : failed in "
3558 			    "ibd_alloc_swqe()\n");
3559 			ibd_fini_txlist(state);
3560 			return (DDI_FAILURE);
3561 		}
3562 
3563 		/* add to list */
3564 		state->id_tx_list.dl_cnt++;
3565 		if (state->id_tx_list.dl_head == NULL) {
3566 			swqe->swqe_prev = NULL;
3567 			swqe->swqe_next = NULL;
3568 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3569 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3570 		} else {
3571 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3572 			swqe->swqe_next = NULL;
3573 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3574 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3575 		}
3576 	}
3577 
3578 	return (DDI_SUCCESS);
3579 }
3580 
3581 /*
3582  * Free the statically allocated Tx buffer list.
3583  */
3584 static void
3585 ibd_fini_txlist(ibd_state_t *state)
3586 {
3587 	ibd_swqe_t *node;
3588 
3589 	mutex_enter(&state->id_tx_list.dl_mutex);
3590 	while (state->id_tx_list.dl_head != NULL) {
3591 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3592 		state->id_tx_list.dl_head = node->swqe_next;
3593 		state->id_tx_list.dl_cnt--;
3594 		ASSERT(state->id_tx_list.dl_cnt >= 0);
3595 		ibd_free_swqe(state, node);
3596 	}
3597 	mutex_exit(&state->id_tx_list.dl_mutex);
3598 }
3599 
3600 /*
3601  * Allocate a single send wqe and register it so it is almost
3602  * ready to be posted to the hardware.
3603  */
3604 static int
3605 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe)
3606 {
3607 	ibt_mr_attr_t mem_attr;
3608 	ibd_swqe_t *swqe;
3609 
3610 	swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP);
3611 	*wqe = swqe;
3612 	swqe->swqe_type = IBD_WQE_SEND;
3613 	swqe->swqe_next = NULL;
3614 	swqe->swqe_prev = NULL;
3615 	swqe->swqe_im_mblk = NULL;
3616 	swqe->w_mdtinfo = NULL;
3617 
3618 	/* alloc copy buffer, must be max size to handle multiple mblk case */
3619 	swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP);
3620 
3621 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3622 	mem_attr.mr_len = state->id_mtu;
3623 	mem_attr.mr_as = NULL;
3624 	mem_attr.mr_flags = IBT_MR_SLEEP;
3625 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3626 	    &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) !=
3627 	    IBT_SUCCESS) {
3628 		DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mr()");
3629 		kmem_free(swqe->swqe_copybuf.ic_bufaddr,
3630 		    state->id_mtu);
3631 		kmem_free(swqe, sizeof (ibd_swqe_t));
3632 		return (DDI_FAILURE);
3633 	}
3634 
3635 	swqe->swqe_copybuf.ic_sgl.ds_va =
3636 	    (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3637 	swqe->swqe_copybuf.ic_sgl.ds_key =
3638 	    swqe->swqe_copybuf.ic_mr_desc.md_lkey;
3639 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3640 
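	/*
	 * Use the swqe pointer itself as the work request id, so that Tx
	 * completion processing can map a cqe straight back to its swqe.
	 */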
3641 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3642 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3643 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3644 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3645 
3646 	/* These are set in send */
3647 	swqe->w_swr.wr_nds = 0;
3648 	swqe->w_swr.wr_sgl = NULL;
3649 
3650 	return (DDI_SUCCESS);
3651 }
3652 
3653 /*
3654  * Free an allocated send wqe.
3655  */
3656 static void
3657 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3658 {
3659 
3660 	if (ibt_deregister_mr(state->id_hca_hdl,
3661 	    swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3662 		DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mr()");
3663 		return;
3664 	}
3665 	kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu);
3666 	kmem_free(swqe, sizeof (ibd_swqe_t));
3667 }
3668 
3669 /*
3670  * Post a rwqe to the hardware and add it to the Rx list. The
3671  * "recycle" parameter indicates whether an old rwqe is being
3672  * recycled, or this is a new one.
3673  */
3674 static int
3675 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3676 {
3677 	if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) !=
3678 	    IBT_SUCCESS) {
3679 		DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()");
3680 		return (DDI_FAILURE);
3681 	}
3682 	atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3683 
3684 	/*
3685 	 * Buffers being recycled are already in the list.
3686 	 */
3687 	if (recycle)
3688 		return (DDI_SUCCESS);
3689 
3690 	mutex_enter(&state->id_rx_list.dl_mutex);
3691 	if (state->id_rx_list.dl_head == NULL) {
3692 		rwqe->rwqe_prev = NULL;
3693 		rwqe->rwqe_next = NULL;
3694 		state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3695 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3696 	} else {
3697 		rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3698 		rwqe->rwqe_next = NULL;
3699 		state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3700 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3701 	}
3702 	mutex_exit(&state->id_rx_list.dl_mutex);
3703 
3704 	return (DDI_SUCCESS);
3705 }
3706 
3707 /*
3708  * Allocate the statically allocated Rx buffer list.
3709  */
3710 static int
3711 ibd_init_rxlist(ibd_state_t *state)
3712 {
3713 	ibd_rwqe_t *rwqe;
3714 	int i;
3715 
3716 	for (i = 0; i < state->id_num_rwqe; i++) {
3717 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3718 			ibd_fini_rxlist(state);
3719 			return (DDI_FAILURE);
3720 		}
3721 
3722 		if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) {
3723 			ibd_free_rwqe(state, rwqe);
3724 			ibd_fini_rxlist(state);
3725 			return (DDI_FAILURE);
3726 		}
3727 	}
3728 
3729 	return (DDI_SUCCESS);
3730 }
3731 
3732 /*
3733  * Free the statically allocated Rx buffer list.
3734  *
3735  */
3736 static void
3737 ibd_fini_rxlist(ibd_state_t *state)
3738 {
3739 	ibd_rwqe_t *node;
3740 
3741 	mutex_enter(&state->id_rx_list.dl_mutex);
3742 	while (state->id_rx_list.dl_head != NULL) {
3743 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3744 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3745 		state->id_rx_list.dl_cnt--;
3746 		ASSERT(state->id_rx_list.dl_cnt >= 0);
3747 
3748 		ibd_free_rwqe(state, node);
3749 	}
3750 	mutex_exit(&state->id_rx_list.dl_mutex);
3751 }
3752 
3753 /*
3754  * Allocate a single recv wqe and register it so it is almost
3755  * ready to be posted to the hardware.
3756  */
3757 static int
3758 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3759 {
3760 	ibt_mr_attr_t mem_attr;
3761 	ibd_rwqe_t *rwqe;
3762 
3763 	if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3764 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3765 		return (DDI_FAILURE);
3766 	}
3767 	*wqe = rwqe;
3768 	rwqe->rwqe_type = IBD_WQE_RECV;
3769 	rwqe->w_state = state;
3770 	rwqe->rwqe_next = NULL;
3771 	rwqe->rwqe_prev = NULL;
3772 	rwqe->w_freeing_wqe = B_FALSE;
3773 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3774 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
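	/*
	 * The mblk desballoc'ed below wraps the copy buffer; when upper
	 * layers free that mblk, ibd_freemsg_cb() runs and can recycle
	 * this rwqe, unless w_freeing_wqe indicates a teardown.
	 */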
3775 
3776 	if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3777 	    IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) {
3778 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2");
3779 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3780 		return (DDI_FAILURE);
3781 	}
3782 
3783 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3784 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3785 	    NULL) {
3786 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3787 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3788 		    state->id_mtu + IPOIB_GRH_SIZE);
3789 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3790 		return (DDI_FAILURE);
3791 	}
3792 
3793 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3794 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3795 	mem_attr.mr_as = NULL;
3796 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3797 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3798 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3799 	    IBT_SUCCESS) {
3800 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mr()");
3801 		rwqe->w_freeing_wqe = B_TRUE;
3802 		freemsg(rwqe->rwqe_im_mblk);
3803 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3804 		    state->id_mtu + IPOIB_GRH_SIZE);
3805 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3806 		return (DDI_FAILURE);
3807 	}
3808 
3809 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3810 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3811 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3812 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3813 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3814 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3815 	rwqe->w_rwr.wr_nds = 1;
3816 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3817 
3818 	return (DDI_SUCCESS);
3819 }
3820 
3821 /*
3822  * Free an allocated recv wqe.
3823  */
3824 static void
3825 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3826 {
3827 
3828 	if (ibt_deregister_mr(state->id_hca_hdl,
3829 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3830 		DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()");
3831 		return;
3832 	}
3833 
3834 	/*
3835 	 * Indicate to the callback function that this rwqe/mblk
3836 	 * should not be recycled. The freemsg() will invoke
3837 	 * ibd_freemsg_cb().
3838 	 */
3839 	if (rwqe->rwqe_im_mblk != NULL) {
3840 		rwqe->w_freeing_wqe = B_TRUE;
3841 		freemsg(rwqe->rwqe_im_mblk);
3842 	}
3843 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3844 	    state->id_mtu + IPOIB_GRH_SIZE);
3845 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3846 }
3847 
3848 /*
3849  * Delete the rwqe being freed from the rx list.
3850  */
3851 static void
3852 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3853 {
3854 	mutex_enter(&state->id_rx_list.dl_mutex);
3855 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3856 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3857 	else
3858 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3859 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3860 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3861 	else
3862 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3863 	mutex_exit(&state->id_rx_list.dl_mutex);
3864 }
3865 
3866 /*
3867  * Pre ibt_detach() deconstruction.
3868  */
3869 static void
3870 ibd_drv_fini(ibd_state_t *state)
3871 {
3872 	ib_gid_t mgid;
3873 	ibd_mce_t *mce;
3874 	ibt_status_t status;
3875 	uint8_t jstate;
3876 
3877 	/*
3878 	 * Unsubscribe from trap notices; we will be tearing down
3879 	 * the mcg lists soon. Make sure the trap handler does nothing
3880 	 * even if it is invoked (i.e., until we invoke ibt_detach()).
3881 	 */
3882 	ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
3883 	mutex_enter(&state->id_trap_lock);
3884 	state->id_trap_stop = B_TRUE;
3885 	while (state->id_trap_inprog > 0)
3886 		cv_wait(&state->id_trap_cv, &state->id_trap_lock);
3887 	mutex_exit(&state->id_trap_lock);
3888 
3889 	/*
3890 	 * Flushing the channel ensures that all pending WQE's
3891 	 * are marked with flush_error and handed to the CQ. It
3892 	 * does not guarantee the invocation of the CQ handler.
3893 	 * This call is guaranteed to return successfully for UD QPNs.
3894 	 */
3895 	status = ibt_flush_channel(state->id_chnl_hdl);
3896 	ASSERT(status == IBT_SUCCESS);
3897 
3898 	/*
3899 	 * We possibly need a loop here to wait for all the Tx
3900 	 * callbacks to happen. The Tx handlers will retrieve
3901 	 * held resources like AH ac_ref count, registered memory
3902 	 * and possibly ASYNC_REAP requests. Rx interrupts were already
3903 	 * turned off (in ibd_detach()); turn off Tx interrupts and
3904 	 * poll. By the time the polling returns an empty indicator,
3905 	 * we are sure we have seen all pending Tx callbacks. Note
3906 	 * that after the ibt_set_cq_handler() returns, the old handler
3907 	 * is guaranteed not to be invoked anymore.
3908 	 */
3909 	if (ibd_separate_cqs == 1)
3910 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
3911 	ibd_poll_compq(state, state->id_scq_hdl);
3912 
3913 	/*
3914 	 * No more async requests will be posted since the device has been
3915 	 * unregistered; completion handlers have been turned off, so Tx
3916 	 * handler will not cause any more ASYNC_REAP requests. Queue a
3917 	 * request for the async thread to exit, which will be serviced
3918 	 * after any pending ones. This can take a while, specially if the
3919 	 * SM is unreachable, since IBMF will slowly timeout each SM request
3920 	 * issued by the async thread. Reap the thread before continuing on,
3921 	 * we do not want it to be lingering in modunloaded code.
3922 	 */
3923 	ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT);
3924 	thread_join(state->id_async_thrid);
3925 
3926 	/*
3927 	 * We can not be in promiscuous mode anymore, upper layers
3928 	 * would have made a request to disable it (if ever set previously)
3929 	 * before the detach is allowed to progress to this point; and the
3930 	 * async thread would have processed that request by now. Thus the
3931 	 * nonmember list is guaranteed empty at this point.
3932 	 */
3933 	ASSERT(state->id_prom_op != COMPLETED);
3934 
3935 	/*
3936 	 * Drop all residual full/non membership. This includes full
3937 	 * membership to the broadcast group, and any nonmembership
3938 	 * acquired during transmits. We do this after the Tx completion
3939 	 * handlers are done, since those might result in some late
3940 	 * leaves; this also eliminates a potential race with that
3941 	 * path wrt the mc full list insert/delete. Trap handling
3942 	 * has also been suppressed at this point. Thus, no locks
3943 	 * are required while traversing the mc full list.
3944 	 */
3945 	DPRINT(2, "ibd_drv_fini : clear full cache entries");
3946 	mce = list_head(&state->id_mc_full);
3947 	while (mce != NULL) {
3948 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
3949 		jstate = mce->mc_jstate;
3950 		mce = list_next(&state->id_mc_full, mce);
3951 		ibd_leave_group(state, mgid, jstate);
3952 	}
3953 
3954 	ibt_free_mcg_info(state->id_mcinfo, 1);
3955 
3956 	/*
3957 	 * Kill the channel now; guaranteed to return successfully
3958 	 * for UD QPNs.
3959 	 */
3960 	status = ibt_free_channel(state->id_chnl_hdl);
3961 	ASSERT(status == IBT_SUCCESS);
3962 
3963 	/*
3964 	 * Kill the CQ; all completion handlers are guaranteed to
3965 	 * have terminated by the time this returns. Since we killed
3966 	 * the QPN above, we can not receive the IBT_CQ_BUSY error.
3967 	 */
3968 	status = ibt_free_cq(state->id_rcq_hdl);
3969 	ASSERT(status == IBT_SUCCESS);
3970 
3971 	if (ibd_separate_cqs == 1) {
3972 		status = ibt_free_cq(state->id_scq_hdl);
3973 		ASSERT(status == IBT_SUCCESS);
3974 	}
3975 
3976 	/*
3977 	 * We killed the receive interrupts, thus, we will not be
3978 	 * required to handle received packets anymore. Thus, kill
3979 	 * service threads since they are not going to be used anymore.
3980 	 */
3981 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3982 
3983 	/*
3984 	 * Since these following will act on the Rx/Tx list, which
3985 	 * is also looked at by the Rx/Tx handlers, keep them around
3986 	 * till all handlers are guaranteed to have completed.
3987 	 */
3988 	ibd_fini_rxlist(state);
3989 	ibd_fini_txlist(state);
3990 
3991 	/*
3992 	 * Clean up the active AH hash list.
3993 	 */
3994 	mod_hash_destroy_hash(state->id_ah_active_hash);
3995 
3996 	/*
3997 	 * Free parallel ARP cache and AHs; we are sure all of these
3998 	 * resources have been released by the Tx completion handler.
3999 	 */
4000 	ibd_acache_fini(state);
4001 
4002 	/*
4003 	 * We freed the QPN, all the MRs and AHs. This step should not
4004 	 * fail; print a warning message if it does fail, due to a bug
4005 	 * in the driver.
4006 	 */
4007 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
4008 		ibd_print_warn(state, "failed to free protection domain");
4009 
4010 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
4011 		ibd_print_warn(state, "failed to close HCA device");
4012 }
4013 
4014 /*
4015  * IBA Rx/Tx completion queue handler. Guaranteed to be single
4016  * threaded and nonreentrant for this CQ. When using combined CQ,
4017  * this handles Tx and Rx completions. With separate CQs, this handles
4018  * only Rx completions.
4019  */
4020 /* ARGSUSED */
4021 static void
4022 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4023 {
4024 	ibd_state_t *state = (ibd_state_t *)arg;
4025 
4026 	atomic_add_64(&state->id_num_intrs, 1);
4027 	(void) gld_intr(state->id_macinfo);
4028 }
4029 
4030 /*
4031  * Separate CQ handler for Tx completions, when the Tx CQ is in
4032  * interrupt driven mode.
4033  */
4034 /* ARGSUSED */
4035 static void
4036 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4037 {
4038 	ibd_state_t *state = (ibd_state_t *)arg;
4039 
4040 	atomic_add_64(&state->id_num_intrs, 1);
4041 
4042 	/*
4043 	 * Poll for completed entries; the CQ will not interrupt any
4044 	 * more for completed packets.
4045 	 */
4046 	ibd_poll_compq(state, state->id_scq_hdl);
4047 
4048 	/*
4049 	 * Now enable CQ notifications; all completions originating now
4050 	 * will cause new interrupts.
4051 	 */
4052 	if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) !=
4053 	    IBT_SUCCESS) {
4054 		/*
4055 		 * We do not expect a failure here.
4056 		 */
4057 		DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
4058 	}
4059 
4060 	/*
4061 	 * Repoll to catch all packets that might have completed after
4062 	 * we finished the first poll loop and before interrupts got
4063 	 * armed.
4064 	 */
4065 	ibd_poll_compq(state, state->id_scq_hdl);
4066 }
4067 
4068 /*
4069  * Multicast group create/delete trap handler. These will be delivered
4070  * on a kernel thread (handling can thus block) and can be invoked
4071  * concurrently. The handler can be invoked anytime after it is
4072  * registered and before ibt_detach().
4073  */
4074 /* ARGSUSED */
4075 static void
4076 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4077     ibt_subnet_event_t *event)
4078 {
4079 	ibd_state_t *state = (ibd_state_t *)arg;
4080 	ibd_req_t *req;
4081 
4082 	/*
4083 	 * The trap handler will get invoked once for every event for
4084 	 * every port. The input "gid" is the GID0 of the port the
4085 	 * trap came in on; we just need to act on traps that came
4086 	 * to our port, meaning the port on which the ipoib interface
4087 	 * resides. Since ipoib uses GID0 of the port, we just match
4088 	 * the gids to check whether we need to handle the trap.
4089 	 */
4090 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4091 		return;
4092 
4093 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4094 
4095 	switch (code) {
4096 		case IBT_SM_EVENT_UNAVAILABLE:
4097 			/*
4098 			 * If we are in promiscuous mode or have
4099 			 * sendnonmembers, we need to print a warning
4100 			 * message right now. Else, just store the
4101 			 * information, print when we enter promiscuous
4102 			 * mode or attempt nonmember send. We might
4103 			 * also want to stop caching sendnonmember.
4104 			 */
4105 			ibd_print_warn(state, "IBA multicast support "
4106 			    "degraded due to unavailability of multicast "
4107 			    "traps");
4108 			break;
4109 		case IBT_SM_EVENT_AVAILABLE:
4110 			/*
4111 			 * If we printed a warning message above or
4112 			 * while trying to nonmember send or get into
4113 			 * promiscuous mode, print an okay message.
4114 			 */
4115 			ibd_print_warn(state, "IBA multicast support "
4116 			    "restored due to availability of multicast "
4117 			    "traps");
4118 			break;
4119 		case IBT_SM_EVENT_MCG_CREATED:
4120 		case IBT_SM_EVENT_MCG_DELETED:
4121 			/*
4122 			 * Common processing of creation/deletion traps.
4123 			 * First check if the instance is being
4124 			 * [de]initialized; back off then, without doing
4125 			 * anything more, since we are not sure if the
4126 			 * async thread is around, or whether we might
4127 			 * be racing with the detach code in ibd_drv_fini()
4128 			 * that scans the mcg list.
4129 			 */
4130 			if (!ibd_async_safe(state))
4131 				return;
4132 
4133 			req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP);
4134 			req->rq_gid = event->sm_notice_gid;
4135 			req->rq_ptr = (void *)code;
4136 			ibd_queue_work_slot(state, req, ASYNC_TRAP);
4137 			break;
4138 	}
4139 }
4140 
4141 static void
4142 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4143 {
4144 	ib_gid_t mgid = req->rq_gid;
4145 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4146 
4147 	DPRINT(10, "ibd_async_trap : %d\n", code);
4148 
4149 	/*
4150 	 * Atomically search the nonmember and sendonlymember lists and
4151 	 * delete.
4152 	 */
4153 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4154 
4155 	if (state->id_prom_op == COMPLETED) {
4156 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4157 
4158 		/*
4159 		 * If in promiscuous mode, try to join/attach to the new
4160 		 * mcg. Given the unreliable out-of-order mode of trap
4161 		 * delivery, we can never be sure whether it is a problem
4162 		 * if the join fails. Thus, we warn the admin of a failure
4163 		 * if this was a creation trap. Note that the trap might
4164 		 * actually be reporting a long past event, and the mcg
4165 		 * might already have been deleted, thus we might be warning
4166 		 * in vain.
4167 		 */
4168 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4169 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4170 			ibd_print_warn(state, "IBA promiscuous mode missed "
4171 			    "new multicast gid %016llx:%016llx",
4172 			    (u_longlong_t)mgid.gid_prefix,
4173 			    (u_longlong_t)mgid.gid_guid);
4174 	}
4175 
4176 	/*
4177 	 * Free the request slot allocated by the subnet event thread.
4178 	 */
4179 	kmem_free(req, sizeof (ibd_req_t));
4180 
4181 	ibd_async_done(state);
4182 }
4183 
4184 /*
4185  * GLD entry point to reset hardware.
4186  */
4187 /* ARGSUSED */
4188 static int
4189 ibd_reset(gld_mac_info_t *macinfo)
4190 {
4191 	/*
4192 	 * This will be invoked from Style 1 open() and Style 2
4193 	 * attach() routines, i.e., just before the interface starts
4194 	 * getting used.
4195 	 */
4196 	return (GLD_SUCCESS);
4197 }
4198 
4199 /*
4200  * GLD entry point to start hardware.
4201  */
4202 /* ARGSUSED */
4203 static int
4204 ibd_start(gld_mac_info_t *macinfo)
4205 {
4206 	return (GLD_SUCCESS);
4207 }
4208 
4209 /*
4210  * GLD entry point to stop hardware from receiving packets.
4211  */
4212 /* ARGSUSED */
4213 static int
4214 ibd_stop(gld_mac_info_t *macinfo)
4215 {
4216 #ifdef RUN_PERFORMANCE
4217 	ibd_perf((ibd_state_t *)macinfo->gldm_private);
4218 #endif
4219 	return (GLD_SUCCESS);
4220 }
4221 
4222 /*
4223  * GLD entry point to modify device's mac address. We do not
4224  * allow address modifications.
4225  */
4226 static int
4227 ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr)
4228 {
4229 	ibd_state_t *state;
4230 
4231 	state = (ibd_state_t *)macinfo->gldm_private;
4232 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4233 		return (GLD_SUCCESS);
4234 	else
4235 		return (GLD_FAILURE);
4236 }
4237 
4238 /*
4239  * The blocking part of the IBA join/leave operations are done out
4240  * of here on the async thread.
4241  */
4242 static void
4243 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4244 {
4245 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4246 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4247 
4248 	if (op == ASYNC_JOIN) {
4249 		int ret = ERRORED;
4250 
4251 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL)
4252 			ret = COMPLETED;
4253 
4254 		state->id_multi_op = ret;
4255 	} else {
4256 		/*
4257 		 * Here, we must search for the proper mcg_info and
4258 		 * use that to leave the group.
4259 		 */
4260 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4261 	}
4262 }
4263 
4264 /*
4265  * GLD entry point for multicast enable/disable requests.
4266  * Invoked by GLD only on the first multicast enable for a specific
4267  * address (GLD is free to retry occasionally if we return RETRY),
4268  * and on last disable of the same address. Just queue the operation
4269  * to the async thread.
4270  */
4271 static int
4272 ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op)
4273 {
4274 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4275 	ipoib_mac_t *mcast;
4276 	ib_gid_t mgid;
4277 	ib_qpn_t mcqpn;
4278 	int ret;
4279 
4280 	/*
4281 	 * The incoming multicast address might not be aligned properly
4282 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4283 	 * it to look like one though, to get the offsets of the mc gid,
4284 	 * since we know we are not going to dereference any values with
4285 	 * the ipoib_mac_t pointer.
4286 	 */
4287 	mcast = (ipoib_mac_t *)mcmac;
4288 
4289 	/*
4290 	 * Check validity of MCG address. We could additionally check
4291 	 * that an enable/disable is not being issued on the "broadcast"
4292 	 * mcg, but since this operation is only invokable by privileged
4293 	 * programs anyway, we allow the flexibility to those dlpi apps.
4294 	 * Note that we do not validate the "scope" of the IBA mcg.
4295 	 */
4296 	bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t));
4297 	if (mcqpn != htonl(IB_MC_QPN))
4298 		return (GLD_FAILURE);
4299 
4300 	/*
4301 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4302 	 * nothing (ie we stay JOINed to the broadcast group done in
4303 	 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
4304 	 * requires us to be joined to broadcast groups at all times.
4305 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4306 	 * depends on this.
4307 	 */
4308 	if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr,
4309 	    IPOIB_ADDRL) == 0)
4310 		return (GLD_SUCCESS);
4311 
4312 	ibd_n2h_gid(mcast, &mgid);
4313 
4314 	if (op == GLD_MULTI_ENABLE) {
4315 		DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n",
4316 		    mgid.gid_prefix, mgid.gid_guid);
4317 		ret = GLD_RETRY;
4318 		mutex_enter(&state->id_mc_mutex);
4319 		if (state->id_multi_op == NOTSTARTED) {
4320 			state->id_multi_req.rq_gid = mgid;
4321 			ibd_queue_work_slot(state, &state->id_multi_req,
4322 			    ASYNC_JOIN);
4323 			state->id_multi_op = ONGOING;
4324 			bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL);
4325 		} else if (bcmp(&state->id_multi_addr, mcast,
4326 		    IPOIB_ADDRL) == 0) {
4327 			if (state->id_multi_op != ONGOING) {
4328 				if (state->id_multi_op == COMPLETED)
4329 					ret = GLD_SUCCESS;
4330 				else if (state->id_multi_op == ERRORED)
4331 					ret = GLD_FAILURE;
4332 				if (state->id_multi_queued) {
4333 					state->id_multi_queued = B_FALSE;
4334 					ibd_queue_work_slot(state,
4335 					    &state->id_multi_req, ASYNC_POKE);
4336 				} else {
4337 					state->id_multi_op = NOTSTARTED;
4338 				}
4339 			}
4340 		} else {
4341 			/*
4342 			 * Hmmm, a set was tried on another mcg. We
4343 			 * need to make sure to gld_sched for this
4344 			 * stream to retry once the ongoing one terminates.
4345 			 * The gld_sched out of the async thread on completion
4346 			 * of the mcg join is not enough; because the queued
4347 			 * stream might come in and get a RETRY again because
4348 			 * the mcg join result has still not been reaped by
4349 			 * the originator. If gld_sched ensured that streams
4350 			 * get tried in the order they received RETRYs, things
4351 			 * would be simpler.
4352 			 */
4353 			state->id_multi_queued = B_TRUE;
4354 		}
4355 		mutex_exit(&state->id_mc_mutex);
4356 	} else {
4357 		ibd_mce_t *mce;
4358 		DPRINT(1, "ibd_set_multicast : unset_multicast : "
4359 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4360 		ret = GLD_SUCCESS;
4361 		mutex_enter(&state->id_mc_mutex);
4362 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
4363 		mutex_exit(&state->id_mc_mutex);
4364 		/*
4365 		 * GLD should not have invoked us unless the mcg was
4366 		 * added in the past.
4367 		 */
4368 		ASSERT(mce != NULL);
4369 		ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0);
4370 		ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE);
4371 	}
4372 	return (ret);
4373 }
4374 
4375 /*
4376  * The blocking part of the IBA promiscuous operations are done
4377  * out of here on the async thread. The dlpireq parameter indicates
4378  * whether this invocation is due to a dlpi request or due to
4379  * a port up/down event.
4380  */
4381 static void
4382 ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq)
4383 {
4384 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4385 	ib_gid_t mgid;
4386 
4387 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4388 
4389 	/*
4390 	 * Mark the request slot as empty and reusable for the
4391 	 * next promiscuous set request.
4392 	 */
4393 	if (dlpireq)
4394 		state->id_prom_op = NOTSTARTED;
4395 
4396 	while (mce != NULL) {
4397 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4398 		mce = list_next(&state->id_mc_non, mce);
4399 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4400 	}
4401 }
4402 
4403 /*
4404  * The blocking part of the IBA promiscuous operations are done
4405  * out of here on the async thread. The dlpireq parameter indicates
4406  * whether this invocation is due to a dlpi request or due to
4407  * a port up/down event.
4408  */
4409 static void
4410 ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq)
4411 {
4412 	ibt_mcg_attr_t mcg_attr;
4413 	ibt_mcg_info_t *mcg_info;
4414 	ib_gid_t mgid;
4415 	uint_t numg;
4416 	int i;
4417 
4418 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4419 
4420 	/*
4421 	 * Obtain all active MC groups on the IB fabric with
4422 	 * specified criteria (scope + Pkey + Qkey + mtu).
4423 	 */
4424 	bzero(&mcg_attr, sizeof (mcg_attr));
4425 	mcg_attr.mc_pkey = state->id_pkey;
4426 	mcg_attr.mc_scope = state->id_scope;
4427 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4428 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4429 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4430 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4431 	    IBT_SUCCESS) {
4432 		ibd_print_warn(state, "Could not get list of IBA multicast "
4433 		    "groups");
4434 		if (dlpireq)
4435 			state->id_prom_op = ERRORED;
4436 		return;
4437 	}
4438 
4439 	/*
4440 	 * Iterate over the returned mcg's and join as NonMember
4441 	 * to the IP mcg's.
4442 	 */
4443 	for (i = 0; i < numg; i++) {
4444 		/*
4445 		 * Do a NonMember JOIN on the MC group.
4446 		 */
4447 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4448 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4449 			ibd_print_warn(state, "IBA promiscuous mode missed "
4450 			    "multicast gid %016llx:%016llx",
4451 			    (u_longlong_t)mgid.gid_prefix,
4452 			    (u_longlong_t)mgid.gid_guid);
4453 	}
4454 
4455 	ibt_free_mcg_info(mcg_info, numg);
4456 	if (dlpireq)
4457 		state->id_prom_op = COMPLETED;
4458 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4459 }
4460 
4461 /*
4462  * GLD entry point for multicast promiscuous enable/disable requests.
4463  * GLD assumes phys state receives more packets than multi state,
4464  * which is not true for IPoIB. Thus, treat the multi and phys
4465  * promiscuous states the same way to work with GLD's assumption.
4466  */
4467 static int
4468 ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode)
4469 {
4470 	ibd_state_t *state;
4471 	int ret;
4472 
4473 	state = (ibd_state_t *)macinfo->gldm_private;
4474 	switch (mode) {
4475 		case GLD_MAC_PROMISC_PHYS:
4476 		case GLD_MAC_PROMISC_MULTI:
4477 			DPRINT(1, "ibd_set_promiscuous : set_promisc : %d",
4478 			    mode);
4479 			/*
4480 			 * Look at gld: this might be getting
4481 			 * called because someone is turning off
4482 			 * prom_phys. Nothing needs to be done in
4483 			 * that case.
4484 			 */
4485 			ret = GLD_RETRY;
4486 			mutex_enter(&state->id_mc_mutex);
4487 			switch (state->id_prom_op) {
4488 				case NOTSTARTED:
4489 					ibd_queue_work_slot(state,
4490 					    &state->id_prom_req, ASYNC_PROMON);
4491 					state->id_prom_op = ONGOING;
4492 					break;
4493 				case COMPLETED:
4494 					ret = GLD_SUCCESS;
4495 					break;
4496 				case ERRORED:
4497 					state->id_prom_op = NOTSTARTED;
4498 					ret = GLD_FAILURE;
4499 			}
4500 			/*
4501 			 * Else in the ONGOING case, nothing special
4502 			 * needs to be done; the async thread will poke
4503 			 * all streams. A prior set, or the last unset
4504 			 * request is still in the async queue.
4505 			 */
4506 			mutex_exit(&state->id_mc_mutex);
4507 			return (ret);
4508 		case GLD_MAC_PROMISC_NONE:
4509 			DPRINT(1, "ibd_set_promiscuous : unset_promisc");
4510 			/*
4511 			 * Look at gld: this might be getting
4512 			 * called because someone is turning off
4513 			 * prom_phys or prom_multi. Mark operation
4514 			 * as ongoing, to prevent a subsequent set
4515 			 * operation from using the request slot
4516 			 * unless the async thread is ready to give
4517 			 * it up. The async thread will mark the
4518 			 * request slot as usable as soon as it
4519 			 * starts doing the unset operation.
4520 			 */
4521 			ASSERT(state->id_prom_op == COMPLETED);
4522 			state->id_prom_op = ONGOING;
4523 			ibd_queue_work_slot(state, &state->id_prom_req,
4524 			    ASYNC_PROMOFF);
4525 			return (GLD_SUCCESS);
4526 		default:
4527 			return (GLD_NOTSUPPORTED);
4528 	}
4529 }
4530 
4531 /*
4532  * GLD entry point for gathering statistics.
4533  */
4534 static int
4535 ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp)
4536 {
4537 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4538 
4539 	sp->glds_errrcv = 0;
4540 	sp->glds_underflow = 0;
4541 	sp->glds_missed = 0;
4542 
4543 	sp->glds_overflow = state->id_tx_short;	/* Tx overflow */
4544 	sp->glds_speed = state->id_link_speed;
4545 	sp->glds_media = GLDM_IB;
4546 	sp->glds_errxmt = state->id_ah_error;	/* failed AH translation */
4547 	sp->glds_norcvbuf = state->id_rx_short;	/* # times below water mark */
4548 	sp->glds_intr = state->id_num_intrs;	/* number of intrs */
4549 
4550 	return (GLD_SUCCESS);
4551 }
4552 
4553 /*
4554  * Arrange for a Tx request that is failing, or has already failed due to
4555  * Tx descriptor shortage to be retried soon. Used mostly with poll based
4556  * Tx completion, since gld_sched() can not be invoked in ibd_send() context
4557  * due to potential single processor deadlock (when the ibd_send() is
4558  * caused by gld_recv()).
4559  */
4560 static void
4561 ibd_tx_sched(ibd_state_t *state)
4562 {
4563 	mutex_enter(&state->id_sched_lock);
4564 	/*
4565 	 * If a sched request is already enqueued, do not try to do
4566 	 * that again, since the async work request list would get
4567 	 * corrupted.
4568 	 */
4569 	if (!state->id_sched_queued) {
4570 		state->id_sched_queued = B_TRUE;
4571 		ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED);
4572 	}
4573 	mutex_exit(&state->id_sched_lock);
4574 }
4575 
4576 /*
4577  * The gld_sched() in ibd_async_work() does the work for us.
4578  */
4579 static void
4580 ibd_async_txsched(ibd_state_t *state)
4581 {
4582 	mutex_enter(&state->id_sched_lock);
4583 	state->id_sched_queued = B_FALSE;
4584 	mutex_exit(&state->id_sched_lock);
4585 }
4586 
4587 /*
4588  * Release one or more chained send wqes back into free list.
4589  */
4590 static void
4591 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe,
4592     boolean_t send_context)
4593 {
4594 	boolean_t call_gld_sched = B_FALSE;
4595 
4596 	/*
4597 	 * Add back on Tx list for reuse.
4598 	 */
4599 	lswqe->swqe_next = NULL;
4600 	mutex_enter(&state->id_tx_list.dl_mutex);
4601 	if (state->id_tx_list.dl_pending_sends) {
4602 		state->id_tx_list.dl_pending_sends = B_FALSE;
4603 		call_gld_sched = B_TRUE;
4604 	}
4605 	if (state->id_tx_list.dl_head == NULL) {
4606 		state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe);
4607 	} else {
4608 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe);
4609 	}
4610 	state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe);
4611 	mutex_exit(&state->id_tx_list.dl_mutex);
4612 
4613 	/*
4614 	 * See comments in ibd_tx_sched(); make sure not to call
4615 	 * gld_sched() if we are in ibd_send() context.
4616 	 */
4617 	if (call_gld_sched)
4618 		if ((ibd_txcomp_poll == 0) && (!send_context))
4619 			gld_sched(state->id_macinfo);
4620 		else
4621 			ibd_tx_sched(state);
4622 }
4623 
4624 /*
4625  * Acquire a number of chained send wqe's from the free list. Returns the
4626  * number of wqe's actually allocated, and pointers to the first and last
4627  * in the chain.
4628  */
4629 static int
4630 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe,
4631     int number)
4632 {
4633 	int numwqe = number;
4634 	ibd_swqe_t *node, *wqes;
4635 
4636 	/*
4637 	 * Check and reclaim some of the completed Tx requests.
4638 	 * If someone else is already in this code and pulling Tx
4639 	 * completions, no need to poll, since the current lock holder
4640 	 * will do the work anyway. Normally, we poll for completions
4641 	 * every few Tx attempts, but if we are short on Tx descriptors,
4642 	 * we always try to poll.
4643 	 */
4644 	if ((ibd_txcomp_poll == 1) &&
4645 	    (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) ==
4646 	    0) || state->id_tx_list.dl_pending_sends) &&
4647 	    (mutex_tryenter(&state->id_txcomp_lock) != 0)) {
4648 		DPRINT(10, "ibd_send : polling");
4649 		ibd_poll_compq(state, state->id_scq_hdl);
4650 		mutex_exit(&state->id_txcomp_lock);
4651 	}
4652 
4653 	/*
4654 	 * Grab required transmit wqes.
4655 	 */
4656 	mutex_enter(&state->id_tx_list.dl_mutex);
4657 	node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head);
4658 	while ((node != NULL) && (numwqe-- > 1))
4659 		node = WQE_TO_SWQE(node->swqe_next);
4660 
4661 	/*
4662 	 * If we did not find the number we were looking for, flag no resource.
4663 	 * Adjust list appropriately in either case.
4664 	 */
4665 	if (numwqe != 0) {
4666 		state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL;
4667 		state->id_tx_list.dl_pending_sends = B_TRUE;
4668 		mutex_exit(&state->id_tx_list.dl_mutex);
4669 		DPRINT(5, "ibd_acquire_swqes: out of Tx wqe");
4670 		atomic_add_64(&state->id_tx_short, 1);
4671 		if (ibd_txcomp_poll == 1) {
4672 			/*
4673 			 * Arrange for a future gld_sched(). Note that when
4674 			 * the Tx is retried after a little bit, it will
4675 			 * surely poll the completion queue above.
4676 			 */
4677 			ibd_tx_sched(state);
4678 		}
4679 	} else {
4680 		state->id_tx_list.dl_head = node->swqe_next;
4681 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node))
4682 			state->id_tx_list.dl_tail = NULL;
4683 		mutex_exit(&state->id_tx_list.dl_mutex);
4684 	}
4685 
4686 	/*
4687 	 * Set return parameters.
4688 	 */
4689 	*fswqe = wqes;
4690 	*lswqe = node;
4691 	return (number - numwqe);
4692 }
4693 
4694 typedef struct ibd_mpack_s {
4695 	ibd_swqe_t	*ip_swqe;
4696 	uint32_t	ip_start, ip_stuff, ip_flags;
4697 	ibd_ace_t	*ip_ace;
4698 	boolean_t	ip_copy;
4699 	boolean_t	ip_noresources;
4700 	int		ip_segs;
4701 	ibt_mr_hdl_t	ip_mhdl[IBD_MDTMAX_SEGS + 1];
4702 	ibt_mr_desc_t	ip_mdsc[IBD_MDTMAX_SEGS + 1];
4703 } ibd_mpack_t;
4704 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s))
4705 
4706 static void
4707 ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info)
4708 {
4709 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4710 	ibd_mpack_t *ptx = (ibd_mpack_t *)cookie;
4711 	ibd_ace_t *ace = ptx->ip_ace;
4712 	ibd_swqe_t *wqes, *node = ptx->ip_swqe;
4713 	boolean_t docopy = ptx->ip_copy;
4714 	uchar_t *pptr;
4715 	int i, pktsize, seglen, seg = 0;
4716 
4717 	/*
4718 	 * Snag the next wqe before we post this one, since it could complete
4719 	 * very fast and the wqe could get put at the end of the list,
4720 	 * corrupting our chain. Set up for the next packet.
4721 	 */
4722 	wqes = WQE_TO_SWQE(node->swqe_next);
4723 	ptx->ip_swqe = wqes;
4724 
4725 	IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff,
4726 	    ptx->ip_flags);
4727 	node->w_ahandle = ace;
4728 	node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
4729 
4730 	if (docopy) {
4731 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
4732 		pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
4733 		pktsize = seglen = PDESC_HDRL(dl_pkt_info);
4734 		if (seglen > 0) {
4735 			bcopy(dl_pkt_info->hdr_rptr, pptr, seglen);
4736 			pptr += seglen;
4737 		}
4738 		for (; seg < dl_pkt_info->pld_cnt; seg++)
4739 			if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) {
4740 				bcopy(dl_pkt_info->pld_ary[seg].pld_rptr,
4741 				    pptr, seglen);
4742 				pptr += seglen;
4743 				pktsize += seglen;
4744 			}
4745 		node->w_swr.wr_nds = 1;
4746 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
4747 	} else {
4748 		seglen = PDESC_HDRL(dl_pkt_info);
4749 		if (seglen > 0) {
4750 			node->w_smblk_sgl[seg].ds_va =
4751 			    (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr;
4752 			node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey;
4753 			node->w_smblk_sgl[seg].ds_len = seglen;
4754 			seg++;
4755 		}
4756 		for (i = 0; i < dl_pkt_info->pld_cnt; i++) {
4757 			if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) {
4758 				node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t)
4759 				    (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr;
4760 				node->w_smblk_sgl[seg].ds_key =
4761 				    ptx->ip_mdsc[dl_pkt_info->
4762 				    pld_ary[i].pld_pbuf_idx + 1].md_lkey;
4763 				node->w_smblk_sgl[seg].ds_len = seglen;
4764 				seg++;
4765 			}
4766 		}
4767 		node->w_swr.wr_sgl = node->w_smblk_sgl;
4768 		node->w_swr.wr_nds = seg;
4769 	}
4770 
4771 	if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) !=
4772 	    IBT_SUCCESS) {
4773 		/*
4774 		 * We never expect a failure here. But handle it, just in case.
4775 		 * If this is not the last packet, there are no problems; if
4776 		 * it is the last packet and the previous ones have not been
4777 		 * transmitted yet by the hardware, in the registration case,
4778 		 * the hardware might transmit garbage since we will be
4779 		 * freemsg'ing. The AH is still safe.
4780 		 */
4781 		DPRINT(5, "ibd_mdt_txone: posting failed");
4782 		ibd_tx_cleanup(state, node, B_TRUE);
4783 	}
4784 }
4785 
4786 static int
4787 ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie)
4788 {
4789 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4790 	multidata_t *dlmdp = mmd_getmultidata(mp);
4791 	ibd_mpack_t *mdinfo;
4792 	mbufinfo_t bufinfo, *binfo = &bufinfo;
4793 	pattrinfo_t attr_info;
4794 	uchar_t *dlap;
4795 	ibt_mr_attr_t mem_attr;
4796 	ibd_swqe_t *wqes, *node;
4797 	ipoib_mac_t *dest;
4798 	size_t hsize, psize = 0;
4799 	int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL);
4800 	int i, ret;
4801 	uint32_t end, value;
4802 	boolean_t noresources = B_FALSE;
4803 
4804 	ASSERT(DB_TYPE(mp) == M_MULTIDATA);
4805 	ASSERT(mp->b_cont == NULL);
4806 
4807 	if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0)
4808 		return (0);
4809 	else if (numwqes != numpackets)
4810 		noresources = B_TRUE;
4811 
4812 	DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node);
4813 
4814 	/*
4815 	 * Allocate the cookie that will be passed to subsequent packet
4816 	 * transmit and post_mdt calls by GLD. We can not sleep, so if
4817 	 * there is no memory, just tell GLD to drop the entire MDT message.
4818 	 */
4819 	if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) {
4820 		ibd_release_swqes(state, wqes, node, B_TRUE);
4821 		return (-1);
4822 	}
4823 	*cookie = (void *)mdinfo;
4824 	mdinfo->ip_noresources = noresources;
4825 
4826 	/*
4827 	 * Walk Global Attributes. If TCP failed to provide destination
4828 	 * information, or some interposing module removed the information,
4829 	 * fail the entire message.
4830 	 */
4831 	attr_info.type = PATTR_DSTADDRSAP;
4832 	if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) {
4833 		ibd_release_swqes(state, wqes, node, B_TRUE);
4834 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4835 		return (-1);
4836 	}
4837 	dlap = ((pattr_addr_t *)attr_info.buf)->addr;
4838 	dest = (ipoib_mac_t *)dlap;
4839 
4840 	/*
4841 	 * Get the AH for this destination, incrementing the posted
4842 	 * reference count properly.
4843 	 */
4844 	if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret,
4845 	    numwqes)) == NULL) {
4846 		ibd_release_swqes(state, wqes, node, B_TRUE);
4847 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4848 		return ((ret == GLD_FAILURE) ? -1 : 0);
4849 	}
4850 
4851 	/*
4852 	 * Depending on how costly it is to copy vs register, we try to
4853 	 * register, falling back on copying if we fail.
4854 	 */
4855 	mmd_getregions(dlmdp, &bufinfo);
4856 	hsize = binfo->hbuf_wptr - binfo->hbuf_rptr;
4857 	for (i = 0; i < binfo->pbuf_cnt; i++)
4858 		psize += (binfo->pbuf_ary[i].pbuf_wptr -
4859 		    binfo->pbuf_ary[i].pbuf_rptr);
4860 	if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) {
4861 		mdinfo->ip_segs = i + 1;
4862 		if (hsize != 0) {
4863 			mem_attr.mr_as = NULL;
4864 			mem_attr.mr_flags = IBT_MR_NOSLEEP;
4865 			mem_attr.mr_vaddr =
4866 			    (uint64_t)(uintptr_t)binfo->hbuf_rptr;
4867 			mem_attr.mr_len = hsize;
4868 			if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
4869 			    &mem_attr, &mdinfo->ip_mhdl[0],
4870 			    &mdinfo->ip_mdsc[0]) != IBT_SUCCESS)
4871 				goto ibd_mdt_copy;
4872 			DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize);
4873 		}
4874 		for (i = 0; i < binfo->pbuf_cnt; i++) {
4875 			if ((psize = (binfo->pbuf_ary[i].pbuf_wptr -
4876 			    binfo->pbuf_ary[i].pbuf_rptr)) != 0) {
4877 				mem_attr.mr_as = NULL;
4878 				mem_attr.mr_flags = IBT_MR_NOSLEEP;
4879 				mem_attr.mr_vaddr = (uint64_t)(uintptr_t)
4880 				    binfo->pbuf_ary[i].pbuf_rptr;
4881 				mem_attr.mr_len = psize;
4882 				if (ibt_register_mr(state->id_hca_hdl,
4883 				    state->id_pd_hdl, &mem_attr,
4884 				    &mdinfo->ip_mhdl[i + 1],
4885 				    &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) {
4886 					for (; i >= 0; i--) {
4887 						(void) ibt_deregister_mr(
4888 						    state->id_hca_hdl,
4889 						    mdinfo->ip_mhdl[i]);
4890 					}
4891 					goto ibd_mdt_copy;
4892 				}
4893 				DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize);
4894 			}
4895 		}
4896 
4897 		mdinfo->ip_copy = B_FALSE;
4898 
4899 		/*
4900 		 * All the deregistration must happen once the last swqe
4901 		 * completes.
4902 		 */
4903 		node->swqe_im_mblk = mp;
4904 		node->w_mdtinfo = mdinfo;
4905 		DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node);
4906 	} else {
4907 ibd_mdt_copy:
4908 		mdinfo->ip_copy = B_TRUE;
4909 	}
4910 
4911 	/*
4912 	 * Do checksum related work.
4913 	 */
4914 	IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff,
4915 	    &end, &value, &mdinfo->ip_flags);
4916 
4917 	mdinfo->ip_swqe = wqes;
4918 	return (numwqes);
4919 }
4920 
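/*
 * GLD entry point invoked after all packets of a multidata message have
 * been transmitted. In the copy case nothing refers to the original
 * message or the cookie any longer, so they are freed here (the message
 * only if wqes had been available for every packet); in the registration
 * case, cleanup is deferred to ibd_tx_cleanup() when the last swqe
 * completes.
 */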
4921 /* ARGSUSED */
4922 static void
4923 ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie)
4924 {
4925 	ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie;
4926 
4927 	if (mdinfo->ip_copy) {
4928 		if (!mdinfo->ip_noresources)
4929 			freemsg(mp);
4930 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4931 	}
4932 }
4933 
4934 /*
4935  * GLD entry point for transmitting a datagram.
4936  * The passed in packet has this format:
4937  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
4938  */
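/*
 * Returns GLD_SUCCESS once the packet has been posted, GLD_NORESOURCES
 * when no send wqe is currently available, GLD_BADARG when the packet
 * exceeds the link MTU, and otherwise the status selected by the
 * failure path (GLD_FAILURE, or the status reported by the address
 * cache lookup).
 */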
4939 static int
4940 ibd_send(gld_mac_info_t *macinfo, mblk_t *mp)
4941 {
4942 	ibt_status_t ibt_status;
4943 	ibt_mr_attr_t mem_attr;
4944 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4945 	ibd_ace_t *ace;
4946 	ibd_swqe_t *node;
4947 	ipoib_mac_t *dest;
4948 	ipoib_ptxhdr_t *ipibp;
4949 	ip6_t *ip6h;
4950 	mblk_t *nmp = mp;
4951 	uint_t pktsize;
4952 	size_t	blksize;
4953 	uchar_t *bufp;
4954 	int i, ret, len, nmblks = 1;
4955 	boolean_t dofree = B_TRUE;
4956 
4957 	if (ibd_acquire_swqes(state, &node, &node, 1) == 0)
4958 		return (GLD_NORESOURCES);
4959 
4960 	/*
4961 	 * Obtain an address handle for the destination.
4962 	 */
4963 	dest = (ipoib_mac_t *)mp->b_rptr;
4964 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
4965 		node->w_ahandle = ace;
4966 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
4967 	} else {
4968 		DPRINT(5,
4969 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
4970 		    ((ret == GLD_FAILURE) ? "failed" : "queued"),
4971 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
4972 		    htonl(dest->ipoib_gidpref[1]),
4973 		    htonl(dest->ipoib_gidsuff[0]),
4974 		    htonl(dest->ipoib_gidsuff[1]));
4975 		node->w_ahandle = NULL;
4976 		goto ibd_send_fail;
4977 	}
4978 
4979 	/*
4980 	 * For ND6 packets, padding is at the front of the source lladdr;
4981 	 * insert that padding here before transmitting.
4982 	 */
4983 	ipibp = (ipoib_ptxhdr_t *)mp->b_rptr;
4984 	if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) {
4985 		if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) {
4986 			if (!pullupmsg(mp, IPV6_HDR_LEN +
4987 			    sizeof (ipoib_ptxhdr_t))) {
4988 				DPRINT(10, "ibd_send: pullupmsg failure ");
4989 				ret = GLD_FAILURE;
4990 				goto ibd_send_fail;
4991 			}
4992 		}
4993 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t));
4994 		len = ntohs(ip6h->ip6_plen);
4995 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
4996 			if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) +
4997 			    IPV6_HDR_LEN + len) {
4998 				if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) +
4999 				    IPV6_HDR_LEN + len)) {
5000 					DPRINT(10, "ibd_send: pullupmsg "
5001 					    "failure ");
5002 					ret = GLD_FAILURE;
5003 					goto ibd_send_fail;
5004 				}
5005 			}
5006 			/* LINTED: E_CONSTANT_CONDITION */
5007 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5008 		}
5009 	}
5010 
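	/*
	 * Strip off the destination pseudo address that GLD prepends,
	 * then count the mblk fragments (stopping once the count exceeds
	 * id_max_sqseg) so that the copy-vs-register decision below can
	 * tell whether the packet fits in the scatter/gather list.
	 */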
5011 	mp->b_rptr += IPOIB_ADDRL;
5012 	while (((nmp = nmp->b_cont) != NULL) &&
5013 	    (++nmblks < (state->id_max_sqseg + 1)))
5014 		;
5015 	pktsize = msgsize(mp);
5016 	if (pktsize > state->id_mtu) {
5017 		ret = GLD_BADARG;
5018 		goto ibd_send_fail;
5019 	}
5020 
5021 	/*
5022 	 * Do checksum related work.
5023 	 */
5024 	IBD_CKSUM_SEND(mp);
5025 
5026 	/*
5027 	 * Copy the data to preregistered buffers, or register the buffer.
5028 	 */
5029 	if ((nmblks <= state->id_max_sqseg) &&
5030 	    (pktsize > IBD_TX_COPY_THRESHOLD)) {
5031 		for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) {
5032 			mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr;
5033 			mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr;
5034 			mem_attr.mr_as = NULL;
5035 			mem_attr.mr_flags = IBT_MR_NOSLEEP;
5036 			ibt_status = ibt_register_mr(state->id_hca_hdl,
5037 			    state->id_pd_hdl, &mem_attr,
5038 			    &node->w_smblkbuf[i].im_mr_hdl,
5039 			    &node->w_smblkbuf[i].im_mr_desc);
5040 			if (ibt_status != IBT_SUCCESS) {
5041 				/*
5042 				 * We do not expect any error other than
5043 				 * IBT_INSUFF_RESOURCE.
5044 				 */
5045 				if (ibt_status != IBT_INSUFF_RESOURCE)
5046 					DPRINT(10, "ibd_send: %s %d\n",
5047 					    "failed in ibt_register_mr()",
5048 					    ibt_status);
5049 				DPRINT(5, "ibd_send: registration failed");
5050 				node->w_swr.wr_nds = i;
5051 				/*
5052 				 * Deregister already registered memory;
5053 				 * fallback to copying the mblk.
5054 				 */
5055 				ibd_deregister_mr(state, node);
5056 				goto ibd_copy_path;
5057 			}
5058 			node->w_smblk_sgl[i].ds_va =
5059 			    (ib_vaddr_t)(uintptr_t)nmp->b_rptr;
5060 			node->w_smblk_sgl[i].ds_key =
5061 			    node->w_smblkbuf[i].im_mr_desc.md_lkey;
5062 			node->w_smblk_sgl[i].ds_len =
5063 			    nmp->b_wptr - nmp->b_rptr;
5064 		}
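		/*
		 * The mblk must stay intact until the hardware has sent
		 * it; deregistration and freemsg are deferred to
		 * ibd_tx_cleanup() when the send completes.
		 */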
5065 		node->swqe_im_mblk = mp;
5066 		node->w_swr.wr_sgl = node->w_smblk_sgl;
5067 		node->w_swr.wr_nds = nmblks;
5068 		dofree = B_FALSE;
5069 	} else {
5070 ibd_copy_path:
5071 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5072 		node->w_swr.wr_nds = 1;
5073 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5074 
5075 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5076 		for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
5077 			blksize = MBLKL(nmp);
5078 			bcopy(nmp->b_rptr, bufp, blksize);
5079 			bufp += blksize;
5080 		}
5081 	}
5082 
5083 	/*
5084 	 * Queue the wqe to hardware.
5085 	 */
5086 	ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL);
5087 	if (ibt_status != IBT_SUCCESS) {
5088 		/*
5089 		 * We should not fail here; but just in case we do, we
5090 		 * tell GLD about this error.
5091 		 */
5092 		ret = GLD_FAILURE;
5093 		DPRINT(5, "ibd_send: posting failed");
5094 		goto ibd_send_fail;
5095 	}
5096 
5097 	DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X",
5098 	    INCTXPACK, htonl(ace->ac_mac.ipoib_qpn),
5099 	    htonl(ace->ac_mac.ipoib_gidpref[0]),
5100 	    htonl(ace->ac_mac.ipoib_gidpref[1]),
5101 	    htonl(ace->ac_mac.ipoib_gidsuff[0]),
5102 	    htonl(ace->ac_mac.ipoib_gidsuff[1]));
5103 
5104 	if (dofree)
5105 		freemsg(mp);
5106 
5107 	return (GLD_SUCCESS);
5108 
5109 ibd_send_fail:
5110 	ibd_tx_cleanup(state, node, B_TRUE);
5111 	return (ret);
5112 }
5113 
5114 /*
5115  * GLD entry point for handling interrupts. When using combined CQ,
5116  * this handles Tx and Rx completions. With separate CQs, this handles
5117  * only Rx completions.
5118  */
5119 static uint_t
5120 ibd_intr(gld_mac_info_t *macinfo)
5121 {
5122 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
5123 
5124 	/*
5125 	 * Poll for completed entries; the CQ will not interrupt any
5126 	 * more for incoming (or transmitted) packets.
5127 	 */
5128 	ibd_poll_compq(state, state->id_rcq_hdl);
5129 
5130 	/*
5131 	 * Now enable CQ notifications; all packets that arrive now
5132 	 * (or complete transmission) will cause new interrupts.
5133 	 */
5134 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
5135 	    IBT_SUCCESS) {
5136 		/*
5137 		 * We do not expect a failure here.
5138 		 */
5139 		DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5140 	}
5141 
5142 	/*
5143 	 * Repoll to catch all packets that might have arrived after
5144 	 * we finished the first poll loop and before interrupts got
5145 	 * armed.
5146 	 */
5147 	ibd_poll_compq(state, state->id_rcq_hdl);
5148 
5149 	return (DDI_INTR_CLAIMED);
5150 }
5151 
5152 /*
5153  * Common code for interrupt handling as well as for polling
5154  * for all completed wqe's while detaching.
5155  */
5156 static void
5157 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5158 {
5159 	ibd_wqe_t *wqe;
5160 	ibt_wc_t *wc, *wcs;
5161 	uint_t numwcs;
5162 	int i;
5163 
5164 	/*
5165 	 * In some cases (e.g., detaching), this code can be invoked on
5166 	 * any cpu after disabling cq notification (thus no concurrency
5167 	 * exists). Apart from that, the following applies normally:
5168 	 * The receive completion handling is always on the Rx interrupt
5169 	 * cpu. Transmit completion handling could be from any cpu if
5170 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5171 	 * is interrupt driven. Combined completion handling is always
5172 	 * on the interrupt cpu. Thus, lock accordingly and use the
5173 	 * proper completion array.
5174 	 */
5175 	if (cq_hdl == state->id_rcq_hdl)
5176 		wcs = state->id_wcs;
5177 	else
5178 		wcs = state->id_txwcs;
5179 
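	/*
	 * Drain the CQ in batches of up to IBD_WC_SIZE completions,
	 * dispatching each one to the Tx or Rx completion path.
	 */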
5180 	while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) {
5181 
5182 		for (i = 0, wc = wcs; i < numwcs; i++, wc++) {
5183 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5184 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5185 			    (wqe->w_type == IBD_WQE_RECV));
5186 			if (wc->wc_status != IBT_WC_SUCCESS) {
5187 				/*
5188 				 * Channel being torn down.
5189 				 */
5190 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5191 					DPRINT(5, "ibd_intr: flush error");
5192 					/*
5193 					 * Only invoke the Tx handler to
5194 					 * release possibly held resources
5195 					 * like AH refcount etc. Cannot
5196 					 * invoke Rx handler because it might
5197 					 * try adding buffers to the Rx pool
5198 					 * when we are trying to deinitialize.
5199 					 */
5200 					if (wqe->w_type == IBD_WQE_RECV)
5201 						continue;
5202 				} else {
5203 					DPRINT(10, "%s %d",
5204 					    "ibd_intr: Bad CQ status",
5205 					    wc->wc_status);
5206 				}
5207 			}
5208 			if (wqe->w_type == IBD_WQE_SEND)
5209 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe),
5210 				    B_FALSE);
5211 			else
5212 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5213 		}
5214 	}
5215 }
5216 
5217 /*
5218  * Deregister the mr associated with a given mblk.
5219  */
5220 static void
5221 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe)
5222 {
5223 	int i;
5224 
5225 	DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe,
5226 	    swqe->w_swr.wr_nds);
5227 	/*
5228 	 * If this is an MDT case, process accordingly.
5229 	 */
5230 	if (swqe->w_mdtinfo != NULL) {
5231 		ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo;
5232 
5233 		for (i = 0; i < mdinfo->ip_segs; i++)
5234 			if ((mdinfo->ip_mhdl[i] != 0) &&
5235 			    (ibt_deregister_mr(state->id_hca_hdl,
5236 			    mdinfo->ip_mhdl[i]) != IBT_SUCCESS))
5237 				DPRINT(10, "MDT deregistration failed\n");
5238 		ASSERT(!mdinfo->ip_copy);
5239 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
5240 		swqe->w_mdtinfo = NULL;
5241 		return;
5242 	}
5243 
5244 	for (i = 0; i < swqe->w_swr.wr_nds; i++) {
5245 		if (ibt_deregister_mr(state->id_hca_hdl,
5246 		    swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) {
5247 			/*
5248 			 * We do not expect any errors here.
5249 			 */
5250 			DPRINT(10, "failed in ibt_deregister_mr()\n");
5251 		}
5252 	}
5253 }
5254 
5255 /*
5256  * Common code that deals with clean ups after a successful or
5257  * erroneous transmission attempt.
5258  */
5259 static void
5260 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context)
5261 {
5262 	ibd_ace_t *ace = swqe->w_ahandle;
5263 
5264 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5265 
5266 	/*
5267 	 * If this was a dynamic registration in ibd_send() or in MDT,
5268 	 * deregister now.
5269 	 */
5270 	if (swqe->swqe_im_mblk != NULL) {
5271 		ibd_deregister_mr(state, swqe);
5272 		freemsg(swqe->swqe_im_mblk);
5273 		swqe->swqe_im_mblk = NULL;
5274 	}
5275 
5276 	/*
5277 	 * Drop the reference count on the AH; it can be reused
5278 	 * now for a different destination if there are no more
5279 	 * posted sends that will use it. This can be eliminated
5280 	 * if we can always associate each Tx buffer with an AH.
5281 	 * The ace can be null if we are cleaning up from the
5282 	 * ibd_send() error path.
5283 	 */
5284 	if (ace != NULL) {
5285 		/*
5286 		 * The recycling logic can be eliminated from here
5287 		 * and put into the async thread if we create another
5288 		 * list to hold ACE's for unjoined mcg's.
5289 		 */
5290 		if (DEC_REF_DO_CYCLE(ace)) {
5291 			ibd_mce_t *mce;
5292 
5293 			/*
5294 			 * Check with the lock taken: we decremented
5295 			 * reference count without the lock, and some
5296 			 * transmitter might already have bumped the
5297 			 * reference count (possible in case of multicast
5298 			 * disable when we leave the AH on the active
5299 			 * list). If not still 0, get out, leaving the
5300 			 * recycle bit intact.
5301 			 *
5302 			 * Atomically transition the AH from active
5303 			 * to free list, and queue a work request to
5304 			 * leave the group and destroy the mce. No
5305 			 * transmitter can be looking at the AH or
5306 			 * the MCE in between, since we have the
5307 			 * ac_mutex lock. In the SendOnly reap case,
5308 			 * it is not necessary to hold the ac_mutex
5309 			 * and recheck the ref count (since the AH was
5310 			 * taken off the active list), we just do it
5311 			 * to have uniform processing with the Full
5312 			 * reap case.
5313 			 */
5314 			mutex_enter(&state->id_ac_mutex);
5315 			mce = ace->ac_mce;
5316 			if (GET_REF_CYCLE(ace) == 0) {
5317 				CLEAR_REFCYCLE(ace);
5318 				/*
5319 				 * Identify the case of fullmember reap as
5320 				 * opposed to mcg trap reap. Also, port up
5321 				 * might set ac_mce to NULL to indicate Tx
5322 				 * cleanup should do no more than put the
5323 				 * AH in the free list (see ibd_async_link).
5324 				 */
5325 				if (mce != NULL) {
5326 					ace->ac_mce = NULL;
5327 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
5328 					/*
5329 					 * mc_req was initialized at mce
5330 					 * creation time.
5331 					 */
5332 					ibd_queue_work_slot(state,
5333 					    &mce->mc_req, ASYNC_REAP);
5334 				}
5335 				IBD_ACACHE_INSERT_FREE(state, ace);
5336 			}
5337 			mutex_exit(&state->id_ac_mutex);
5338 		}
5339 	}
5340 
5341 	/*
5342 	 * Release the send wqe for reuse.
5343 	 */
5344 	ibd_release_swqes(state, swqe, swqe, send_context);
5345 }
5346 
5347 /*
5348  * Processing to be done after receipt of a packet; hand off to GLD
5349  * in the format expected by GLD.
5350  * The recvd packet has this format: 2b sap :: 00 :: data.
5351  */
5352 static void
5353 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5354 {
5355 	ipoib_pgrh_t *pgrh;
5356 	mblk_t *mp;
5357 	ipoib_hdr_t *ipibp;
5358 	ip6_t *ip6h;
5359 	int rxcnt, len;
5360 
5361 	/*
5362 	 * Track number handed to upper layer, and number still
5363 	 * available to receive packets.
5364 	 */
5365 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5366 	ASSERT(rxcnt >= 0);
5367 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5368 
5369 	/*
5370 	 * Adjust write pointer depending on how much data came in.
5371 	 */
5372 	mp = rwqe->rwqe_im_mblk;
5373 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5374 
5375 	/*
5376 	 * If the GRH is not valid, indicate to GLD by setting
5377 	 * the VerTcFlow field to 0. Else, update the pseudoGRH
5378 	 * so that GLD can determine the source mac of the packet.
5379 	 */
5380 	pgrh = (ipoib_pgrh_t *)mp->b_rptr;
5381 	if (wc->wc_flags & IBT_WC_GRH_PRESENT)
5382 		pgrh->ipoib_sqpn = htonl(wc->wc_qpn);
5383 	else
5384 		pgrh->ipoib_vertcflow = 0;
5385 
5386 	DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK);
5387 
5388 	/*
5389 	 * For ND6 packets, padding is at the front of the source/target
5390 	 * lladdr. However the inet6 layer is not aware of it, hence remove
5391 	 * the padding from such packets.
5392 	 */
5393 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
5394 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
5395 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
5396 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5397 			    sizeof (ipoib_hdr_t))) {
5398 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
5399 				freemsg(mp);
5400 				return;
5401 			}
5402 		}
5403 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5404 		len = ntohs(ip6h->ip6_plen);
5405 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5406 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
5407 			    IPV6_HDR_LEN + len) {
5408 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
5409 				    IPV6_HDR_LEN + len)) {
5410 					DPRINT(10, "ibd_process_rx: pullupmsg"
5411 					    " failed");
5412 					freemsg(mp);
5413 					return;
5414 				}
5415 			}
5416 			/* LINTED: E_CONSTANT_CONDITION */
5417 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
5418 		}
5419 	}
5420 
5421 	/*
5422 	 * Hand off to service thread/GLD. When we have hardware that
5423 	 * does hardware checksum, we will pull the checksum from the
5424 	 * work completion structure here (this runs on the interrupt
5425 	 * cpu).
5426 	 */
5427 	ibd_send_up(state, mp);
5428 
5429 	/*
5430 	 * Possibly replenish the Rx pool if needed.
5431 	 */
5432 	if (rxcnt < IBD_RX_THRESHOLD) {
5433 		state->id_rx_short++;
5434 		if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) {
5435 			if (ibd_post_rwqe(state, rwqe, B_FALSE) ==
5436 			    DDI_FAILURE) {
5437 				ibd_free_rwqe(state, rwqe);
5438 				return;
5439 			}
5440 		}
5441 	}
5442 }
5443 
5444 /*
5445  * Callback code invoked from STREAMS when the recv data buffer is free
5446  * for recycling.
5447  */
5448 static void
5449 ibd_freemsg_cb(char *arg)
5450 {
5451 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
5452 	ibd_state_t *state = rwqe->w_state;
5453 
5454 	/*
5455 	 * If the wqe is being destructed, do not attempt recycling.
5456 	 */
5457 	if (rwqe->w_freeing_wqe == B_TRUE) {
5458 		DPRINT(6, "ibd_freemsg_cb: wqe being freed");
5459 		return;
5460 	}
5461 
5462 	/*
5463 	 * Upper layer has released held mblk.
5464 	 */
5465 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
5466 
5467 	if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) {
5468 		/*
5469 		 * There are already enough buffers on the Rx ring.
5470 		 * Free this one up.
5471 		 */
5472 		rwqe->rwqe_im_mblk = NULL;
5473 		ibd_delete_rwqe(state, rwqe);
5474 		ibd_free_rwqe(state, rwqe);
5475 		DPRINT(6, "ibd_freemsg_cb: free up wqe");
5476 	} else {
5477 		rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
5478 		    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
5479 		if (rwqe->rwqe_im_mblk == NULL) {
5480 			ibd_delete_rwqe(state, rwqe);
5481 			ibd_free_rwqe(state, rwqe);
5482 			DPRINT(6, "ibd_freemsg_cb: desballoc failed");
5483 			return;
5484 		}
5485 
5486 		/*
5487 		 * Post back to h/w. We could actually have more than
5488 		 * id_num_rwqe WQEs on the list if there were multiple
5489 		 * ibd_freemsg_cb() calls outstanding (since the lock is
5490 		 * not held the entire time). This will start getting
5491 		 * corrected over subsequent ibd_freemsg_cb() calls.
5492 		 */
5493 		if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
5494 			ibd_delete_rwqe(state, rwqe);
5495 			ibd_free_rwqe(state, rwqe);
5496 			return;
5497 		}
5498 	}
5499 }
5500 
5501 #ifdef RUN_PERFORMANCE
5502 
5503 /*
5504  * To run the performance test, first do the "ifconfig ibdN plumb" on
5505  * the Rx and Tx side. Then use mdb -kw to tweak the following variables:
5506  * ibd_performance=1.
5507  * ibd_receiver=1 on Rx side.
5508  * ibd_sender=1 on Tx side.
5509  * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update
5510  * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx; this will
5511  * make it drop into a 1 minute loop waiting for packets. An
5512  * ifconfig/unplumb on the Tx will cause it to send packets to Rx.
5513  */
5514 
5515 #define	IBD_NUM_UNSIGNAL	ibd_num_unsignal
5516 #define	IBD_TX_PKTSIZE		ibd_tx_pktsize
5517 #define	IBD_TX_DATASIZE		ibd_tx_datasize
5518 
5519 static ibd_swqe_t **swqes;
5520 static ibt_wc_t *wcs;
5521 
5522 /*
5523  * Set these on Rx and Tx side to do performance run.
5524  */
5525 static int ibd_performance = 0;
5526 static int ibd_receiver = 0;
5527 static int ibd_sender = 0;
5528 static ipoib_mac_t ibd_dest;
5529 
5530 /*
5531  * Interrupt coalescing is achieved by asking for a completion intr
5532  * only every ibd_num_unsignal'th packet.
5533  */
5534 static int ibd_num_unsignal = 8;
5535 
5536 /*
5537  * How big is each packet?
5538  */
5539 static int ibd_tx_pktsize = 2048;
5540 
5541 /*
5542  * Total data size to be transmitted.
5543  */
5544 static int ibd_tx_datasize = 512*1024*1024;
5545 
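/*
 * cq_handler_ran tells the posting loop in ibd_perf_tx() that the CQ
 * handler has run (and may have freed up channel slots);
 * num_completions counts the signaled completions seen so far.
 */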
5546 static volatile boolean_t cq_handler_ran = B_FALSE;
5547 static volatile int num_completions;
5548 
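/*
 * CQ handler used during performance runs on both the Tx and Rx sides:
 * it drains the CQ (on the Rx side, recycling receive buffers straight
 * back to the channel), then arms CQ notification and repolls to close
 * the race between the final poll and enabling the notification.
 */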
5549 /* ARGSUSED */
5550 static void
5551 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg)
5552 {
5553 	ibd_state_t *state = (ibd_state_t *)arg;
5554 	ibt_cq_hdl_t cqhdl;
5555 	ibd_wqe_t *wqe;
5556 	uint_t polled, i;
5557 	boolean_t cq_enabled = B_FALSE;
5558 
5559 	if (ibd_receiver == 1)
5560 		cqhdl = state->id_rcq_hdl;
5561 	else
5562 		cqhdl = state->id_scq_hdl;
5563 
5564 	/*
5565 	 * Mark the handler as having run and possibly freed up some
5566 	 * slots. Blocked sends can be retried.
5567 	 */
5568 	cq_handler_ran = B_TRUE;
5569 
5570 repoll:
5571 	while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) ==
5572 	    IBT_SUCCESS) {
5573 		num_completions += polled;
5574 		if (ibd_receiver == 1) {
5575 			/*
5576 			 * We can immediately recycle the buffer. No
5577 			 * need to pass up to any IP layer ...
5578 			 */
5579 			for (i = 0; i < polled; i++) {
5580 				wqe = (ibd_wqe_t *)(uintptr_t)wcs[i].wc_id;
5581 				(void) ibt_post_recv(state->id_chnl_hdl,
5582 				    &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL);
5583 			}
5584 		}
5585 	}
5586 
5587 	/*
5588 	 * If we just repolled, we are done; exit.
5589 	 */
5590 	if (cq_enabled)
5591 		return;
5592 
5593 	/*
5594 	 * Enable CQ.
5595 	 */
5596 	if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
5597 		/*
5598 		 * We do not expect a failure here.
5599 		 */
5600 		cmn_err(CE_CONT, "ibd_perf_handler: notify failed");
5601 	}
5602 	cq_enabled = B_TRUE;
5603 
5604 	/*
5605 	 * Repoll for packets that came in after we finished previous
5606 	 * poll loop but before we turned on notifications.
5607 	 */
5608 	goto repoll;
5609 }
5610 
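/*
 * Transmit side of the performance test: register a single send buffer,
 * set up IBD_NUM_SWQE private send wqes all aimed at ibd_dest, and post
 * them repeatedly, spinning until the CQ handler runs whenever the
 * channel fills up. Elapsed and spin times are reported via cmn_err().
 */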
5611 static void
5612 ibd_perf_tx(ibd_state_t *state)
5613 {
5614 	ibt_mr_hdl_t mrhdl;
5615 	ibt_mr_desc_t mrdesc;
5616 	ibt_mr_attr_t mem_attr;
5617 	ibt_status_t stat;
5618 	ibd_ace_t *ace = NULL;
5619 	ibd_swqe_t *node;
5620 	uchar_t *sendbuf;
5621 	longlong_t stime, etime;
5622 	longlong_t sspin, espin, tspin = 0;
5623 	int i, reps, packets;
5624 
5625 	cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X",
5626 	    htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]),
5627 	    htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]),
5628 	    htonl(ibd_dest.ipoib_gidsuff[1]));
5629 	if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) ||
5630 	    (ibd_dest.ipoib_gidpref[1] == 0)) {
5631 		cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address");
5632 		return;
5633 	}
5634 
5635 	packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE);
5636 	reps = (packets / IBD_NUM_SWQE);
5637 
5638 	cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE);
5639 	cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE);
5640 	cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets);
5641 	cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE);
5642 	cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL);
5643 	if ((packets % IBD_NUM_UNSIGNAL) != 0) {
5644 		/*
5645 		 * This is required to ensure the last packet will trigger
5646 		 * a CQ handler callback, thus we can spin waiting for all
5647 		 * packets to be received.
5648 		 */
5649 		cmn_err(CE_CONT,
5650 		    "ibd_perf_tx: #Packets not multiple of Signal Grp size");
5651 		return;
5652 	}
5653 	num_completions = 0;
5654 
5655 	swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE,
5656 	    KM_NOSLEEP);
5657 	if (swqes == NULL) {
5658 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5659 		return;
5660 	}
5661 
5662 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5663 	if (wcs == NULL) {
5664 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5665 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5666 		return;
5667 	}
5668 
5669 	/*
5670 	 * Get the ud_dest for the destination.
5671 	 */
5672 	ibd_async_acache(state, &ibd_dest);
5673 	mutex_enter(&state->id_ac_mutex);
5674 	ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0);
5675 	mutex_exit(&state->id_ac_mutex);
5676 	if (ace == NULL) {
5677 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5678 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5679 		cmn_err(CE_CONT, "ibd_perf_tx: no AH");
5680 		return;
5681 	}
5682 
5683 	/*
5684 	 * Set up the send buffer.
5685 	 */
5686 	sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP);
5687 	if (sendbuf == NULL) {
5688 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5689 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5690 		cmn_err(CE_CONT, "ibd_perf_tx: no send buffer");
5691 		return;
5692 	}
5693 
5694 	/*
5695 	 * This buffer can be used in the case when we want to
5696 	 * send data from the same memory area over and over;
5697 	 * it might help in reducing memory traffic.
5698 	 */
5699 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)sendbuf;
5700 	mem_attr.mr_len = IBD_TX_PKTSIZE;
5701 	mem_attr.mr_as = NULL;
5702 	mem_attr.mr_flags = IBT_MR_NOSLEEP;
5703 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
5704 	    &mrhdl, &mrdesc) != IBT_SUCCESS) {
5705 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5706 		kmem_free(sendbuf, IBD_TX_PKTSIZE);
5707 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5708 		cmn_err(CE_CONT, "ibd_perf_tx: registration failed");
5709 		return;
5710 	}
5711 
5712 	/*
5713 	 * Allocate private send wqe's.
5714 	 */
5715 	for (i = 0; i < IBD_NUM_SWQE; i++) {
5716 		if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) {
5717 			kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5718 			kmem_free(sendbuf, IBD_TX_PKTSIZE);
5719 			kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5720 			cmn_err(CE_CONT, "ibd_alloc_swqe failure");
5721 			return;
5722 		}
5723 		node->w_ahandle = ace;
5724 #if 0
5725 		node->w_smblkbuf[0].im_mr_hdl = mrhdl;
5726 		node->w_smblkbuf[0].im_mr_desc = mrdesc;
5727 		node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf;
5728 		node->w_smblk_sgl[0].ds_key =
5729 		    node->w_smblkbuf[0].im_mr_desc.md_lkey;
5730 		node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE;
5731 		node->w_swr.wr_sgl = node->w_smblk_sgl;
5732 #else
5733 		node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE;
5734 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5735 #endif
5736 
5737 		/*
5738 		 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
5739 		 * is marked to invoke the CQ handler. That is the only
5740 		 * way we come to know when the send queue can accept more
5741 		 * WRs.
5742 		 */
5743 		if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
5744 			node->w_swr.wr_flags = IBT_WR_NO_FLAGS;
5745 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5746 		node->w_swr.wr_nds = 1;
5747 
5748 		swqes[i] = node;
5749 	}
5750 
5751 	ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state);
5752 
5753 	/*
5754 	 * Post all the requests. We expect this stream of posts will
5755 	 * not overwhelm the hardware, due to the periodic completions and
5756 	 * polling that happen out of ibd_perf_handler.
5757 	 * Post a batch of requests until the channel stops accepting
5758 	 * them; after that, wait for the CQ handler to notify us that
5759 	 * there is more space.
5760 	 */
5761 	stime = gethrtime();
5762 	for (; reps > 0; reps--)
5763 		for (i = 0; i < IBD_NUM_SWQE; i++) {
5764 			node = swqes[i];
5765 retry:
5766 			if ((stat = ibt_post_send(state->id_chnl_hdl,
5767 			    &node->w_swr, 1, NULL)) != IBT_SUCCESS) {
5768 				if (stat == IBT_CHAN_FULL) {
5769 					/*
5770 					 * Spin till the CQ handler runs
5771 					 * and then try again.
5772 					 */
5773 					sspin = gethrtime();
5774 					while (!cq_handler_ran)
5775 						;
5776 					espin = gethrtime();
5777 					tspin += (espin - sspin);
5778 					cq_handler_ran = B_FALSE;
5779 					goto retry;
5780 				}
5781 				cmn_err(CE_CONT, "post failure %d/%d", stat, i);
5782 				goto done;
5783 			}
5784 		}
5785 
5786 done:
5787 	/*
5788 	 * We should really be snapshotting when we get the last
5789 	 * completion.
5790 	 */
5791 	while (num_completions != (packets / IBD_NUM_UNSIGNAL))
5792 		;
5793 	etime = gethrtime();
5794 
5795 	cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d",
5796 	    num_completions);
5797 	cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime));
5798 	cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin);
5799 
5800 	/*
5801 	 * Wait a couple of seconds for everything to complete.
5802 	 */
5803 	delay(drv_usectohz(2000000));
5804 
5805 	/*
5806 	 * Reset CQ handler to real one; free resources.
5807 	 */
5808 	if (ibd_separate_cqs == 0) {
5809 		ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state);
5810 	} else {
5811 		if (ibd_txcomp_poll == 0)
5812 			ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler,
5813 			    state);
5814 		else
5815 			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5816 	}
5817 
5818 	for (i = 0; i < IBD_NUM_SWQE; i++)
5819 		ibd_free_swqe(state, swqes[i]);
5820 	(void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
5821 	kmem_free(sendbuf, IBD_TX_PKTSIZE);
5822 	kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5823 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5824 }
5825 
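/*
 * Receive side of the performance test: the regular receive wqes are
 * reused, the CQ handler is temporarily replaced by ibd_perf_handler,
 * and this thread simply sleeps for a minute while completions are
 * counted.
 */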
5826 static void
5827 ibd_perf_rx(ibd_state_t *state)
5828 {
5829 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5830 	if (wcs == NULL) {
5831 		/* swqes is never allocated on the Rx side */
5832 		cmn_err(CE_CONT, "ibd_perf_rx: no storage");
5833 		return;
5834 	}
5835 
5836 	/*
5837 	 * We do not need to allocate private recv wqe's. We will
5838 	 * just use the regular ones.
5839 	 */
5840 
5841 	num_completions = 0;
5842 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state);
5843 
5844 	/*
5845 	 * Delay for a minute for all the packets to come in from
5846 	 * transmitter.
5847 	 */
5848 	cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE);
5849 	delay(drv_usectohz(60000000));
5850 	cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions);
5851 
5852 	/*
5853 	 * Reset CQ handler to real one; free resources.
5854 	 */
5855 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5856 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5857 }
5858 
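/*
 * Entry point for the optional performance test; dispatch to the Rx or
 * Tx loop according to the ibd_performance/ibd_receiver/ibd_sender
 * tunables above.
 */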
5859 static void
5860 ibd_perf(ibd_state_t *state)
5861 {
5862 	if (ibd_performance == 0)
5863 		return;
5864 
5865 	if (ibd_receiver == 1) {
5866 		ibd_perf_rx(state);
5867 		return;
5868 	}
5869 
5870 	if (ibd_sender == 1) {
5871 		ibd_perf_tx(state);
5872 		return;
5873 	}
5874 }
5875 
5876 #endif /* RUN_PERFORMANCE */
5877