xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 7f7322febbcfe774b7270abc3b191c094bfcc517)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * An implementation of the IPoIB standard based on PSARC 2001/289.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/conf.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/modctl.h>
38 #include <sys/stropts.h>
39 #include <sys/stream.h>
40 #include <sys/strsun.h>
41 #include <sys/strsubr.h>
42 #include <sys/dlpi.h>
43 
44 #include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
45 #include <sys/sysmacros.h>	/* for offsetof */
46 #include <sys/disp.h>		/* for async thread pri */
47 #include <sys/atomic.h>		/* for atomic_add*() */
48 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
49 #include <netinet/in.h>		/* for netinet/ip.h below */
50 #include <netinet/ip.h>		/* for struct ip */
51 #include <netinet/udp.h>	/* for struct udphdr */
52 #include <inet/common.h>	/* for inet/ip.h below */
53 #include <inet/ip.h>		/* for ipha_t */
54 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
55 #include <inet/ip6.h>		/* for ip6_t */
56 #include <netinet/icmp6.h>	/* for icmp6_t */
57 #include <sys/callb.h>
58 #include <sys/modhash.h>
59 
60 #include <sys/ib/clients/ibd/ibd.h>
61 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
62 #include <sys/note.h>
63 #include <sys/pattr.h>
64 #include <sys/multidata.h>
65 
66 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
67 
68 /*
69  * Modes of hardware/driver/software checksum, useful for debugging
70  * and performance studies.
71  *
72  * none: h/w (Tavor) and driver does not do checksum, IP software must.
 * partial: driver does data checksum, IP must provide pseudo header.
 * perf_partial: driver uses IP provided pseudo cksum as data checksum
75  *		 (thus, real checksumming is not done).
76  */
typedef enum {
	IBD_CSUM_NONE,		/* driver does no checksum; IP software must */
	IBD_CSUM_PARTIAL,	/* driver checksums data over IP's pseudo hdr */
	IBD_CSUM_PERF_PARTIAL	/* pseudo cksum reused as-is; no real cksum */
} ibd_csum_type_t;
82 
/*
 * Link operation codes.
 * NOTE(review): the consumers of this type are not visible in this part of
 * the file; presumably these are handed to the link-event handling code
 * (ibd_async_link) -- confirm, in particular what IBD_LINK_UP_ABSENT means.
 */
typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;
84 
85 /*
86  * Per interface tunable parameters.
87  */
/* Backs IBD_RX_THRESHOLD. */
static uint_t ibd_rx_threshold = 16;
/* Backs IBD_TX_COPY_THRESHOLD. */
static uint_t ibd_tx_current_copy_threshold = 0x10000000;
/* Backs IBD_NUM_RWQE. */
static uint_t ibd_num_rwqe = 4095;	/* 1 less than max Tavor CQsize */
/* Backs IBD_NUM_SWQE. */
static uint_t ibd_num_swqe = 4095;	/* 1 less than max Tavor CQsize */
/* Backs IBD_NUM_AH. */
static uint_t ibd_num_ah = 16;
/* Backs IBD_HASH_SIZE. */
static uint_t ibd_hash_size = 16;
/*
 * Service fifo policy read by map_rx_srv_fifos(): 0 disables service
 * fifos, 1 requests them even on a single cpu system, and the default
 * 0xffff selects the per-platform default behavior.
 */
static uint_t ibd_srv_fifos = 0xffff;
/*
 * NOTE(review): presumably an administrator override for the depth of each
 * service fifo, with 0 meaning "not specified" -- confirm against the rest
 * of map_rx_srv_fifos().
 */
static uint_t ibd_fifo_depth = 0;
/* Checksum modes consulted by the IBD_CKSUM_* macros. */
static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;
98 
99 /*
 * The driver can use separate CQs for send and receive queues.
101  * While using separate CQs, it is possible to put the send CQ
102  * in polling mode, ie not to enable notifications on that CQ.
103  * If both CQs are interrupt driven, currently it is not possible
104  * for their handlers to be invoked concurrently (since Tavor ties
105  * both interrupts to the same PCI intr line); but the handlers
106  * are not coded with a single interrupt cpu assumption (eg
107  * id_num_intrs is incremented atomically).
108  *
109  * The driver private struct uses id_scq_hdl to track the separate
110  * CQ being used for send; the id_rcq_hdl tracks the receive CQ
111  * if using separate CQs, or it tracks the single CQ when using
112  * combined CQ. The id_wcs completion array is used in the combined
113  * CQ case, and for fetching Rx completions in the separate CQs case;
114  * the id_txwcs is used to fetch Tx completions in the separate CQs
115  * case.
116  */
117 static uint_t ibd_separate_cqs = 1;
118 static uint_t ibd_txcomp_poll = 0;
119 
120 /*
121  * Initial number of IBA resources allocated.
122  */
123 #define	IBD_NUM_RWQE	ibd_num_rwqe
124 #define	IBD_NUM_SWQE	ibd_num_swqe
125 #define	IBD_NUM_AH	ibd_num_ah
126 
127 /* when <= threshold, it's faster to copy to a premapped buffer */
128 #define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold
129 
130 /*
131  * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
133  * IBD_NUM_RWQE/id_num_rwqe.
134  */
135 #define	IBD_RX_THRESHOLD	ibd_rx_threshold
136 
137 /*
138  * Hash table size for the active AH list.
139  */
140 #define	IBD_HASH_SIZE	ibd_hash_size
141 
142 /*
143  * Size of completion array to be filled by a single poll call.
144  */
145 #define	IBD_WC_SIZE	16
146 
147 /*
148  * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This
149  * is based on our above completion array size.
150  */
151 #define	IBD_TXPOLL_MASK	0xf
152 
153 /*
154  * Number of payload areas the MDT code can support. Choose the same value
155  * that we know is supported by TCP/MDT.
156  */
157 #define	IBD_MDTMAX_SEGS	16
158 
159 /*
160  * PAD routine called during send/recv context
161  */
162 #define	IBD_SEND	0
163 #define	IBD_RECV	1
164 
165 /* Driver State Pointer */
166 void *ibd_list;
167 
168 /* Required system entry points */
169 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
170 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
171 
172 /* Required driver entry points for GLD */
173 static int ibd_reset(gld_mac_info_t *);
174 static int ibd_start(gld_mac_info_t *);
175 static int ibd_stop(gld_mac_info_t *);
176 static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *);
177 static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int);
178 static int ibd_set_promiscuous(gld_mac_info_t *, int);
179 static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *);
180 static int ibd_send(gld_mac_info_t *, mblk_t *);
181 static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **);
182 static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *);
183 static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *);
184 static uint_t ibd_intr(gld_mac_info_t *);
185 
186 /* Private driver entry points for GLD */
187 static int ibd_state_init(ibd_state_t *, dev_info_t *);
188 static void ibd_state_fini(ibd_state_t *);
189 static int ibd_drv_init(ibd_state_t *);
190 static void ibd_drv_fini(ibd_state_t *);
191 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
192 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
193 static void ibd_snet_notices_handler(void *, ib_gid_t,
194     ibt_subnet_event_code_t, ibt_subnet_event_t *);
195 static int ibd_init_txlist(ibd_state_t *);
196 static void ibd_fini_txlist(ibd_state_t *);
197 static int ibd_init_rxlist(ibd_state_t *);
198 static void ibd_fini_rxlist(ibd_state_t *);
199 static void ibd_freemsg_cb(char *);
200 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t);
201 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
202 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **);
203 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
204 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
205 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
206 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
207     ibt_async_event_t *);
208 static int ibd_acache_init(ibd_state_t *);
209 static void ibd_acache_fini(ibd_state_t *);
210 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
211 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
212 static void ibd_async_unsetprom(ibd_state_t *, boolean_t);
213 static void ibd_async_setprom(ibd_state_t *, boolean_t);
214 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
215 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
216 static void ibd_async_txsched(ibd_state_t *);
217 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
218 static void ibd_async_work(ibd_state_t *);
219 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
220 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
221 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
222 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *,
223     ipoib_mac_t *);
224 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
225 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *);
226 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
227 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
228 static uint64_t ibd_get_portspeed(ibd_state_t *);
229 
230 #ifdef RUN_PERFORMANCE
231 static void ibd_perf(ibd_state_t *);
232 #endif
233 
234 /* Streams Module Info */
235 static struct module_info ibd_minfo = {
236 	IBD_IDNUM,		/* module ID Number */
237 	"ibd",			/* module name */
238 	0,			/* min packet size */
239 	INFPSZ,			/* maximum packet size */
240 	IBD_HIWAT,		/* high water mark */
241 	IBD_LOWAT		/* low water mark */
242 };
243 
244 /* Streams Read Queue */
245 static struct qinit ibd_rdinit = {
246 	NULL,			/* put */
247 	gld_rsrv,		/* service */
248 	gld_open,		/* open */
249 	gld_close,		/* close */
250 	NULL,			/* unused */
251 	&ibd_minfo,		/* parameters */
252 	NULL			/* statistics */
253 };
254 
255 /* Streams Write Queue */
256 static struct qinit ibd_wrinit = {
257 	gld_wput,		/* put */
258 	gld_wsrv,		/* service */
259 	NULL,			/* open */
260 	NULL,			/* close */
261 	NULL,			/* unused */
262 	&ibd_minfo,		/* parameters */
263 	NULL			/* statistics */
264 };
265 
266 /* Stream Operations */
267 static struct streamtab ibd_streamtab = {
268 	&ibd_rdinit,		/* read queue */
269 	&ibd_wrinit,		/* write queue */
270 	NULL,			/* lower read queue (MUX) */
271 	NULL			/* lower write queue (MUX) */
272 };
273 
274 /* Character/Block Operations */
275 static struct cb_ops ibd_cb_ops = {
276 	nulldev,		/* open */
277 	nulldev,		/* close */
278 	nodev,			/* strategy (block) */
279 	nodev,			/* print (block) */
280 	nodev,			/* dump (block) */
281 	nodev,			/* read */
282 	nodev,			/* write */
283 	nodev,			/* ioctl */
284 	nodev,			/* devmap */
285 	nodev,			/* mmap */
286 	nodev,			/* segmap */
287 	nochpoll,		/* chpoll */
288 	ddi_prop_op,		/* prop_op */
289 	&ibd_streamtab,		/* streams */
290 	D_MP | D_64BIT,		/* flags */
291 	CB_REV			/* rev */
292 };
293 
294 /* Driver Operations */
295 static struct dev_ops ibd_dev_ops = {
296 	DEVO_REV,		/* struct rev */
297 	0,			/* refcnt */
298 	gld_getinfo,		/* getinfo */
299 	nulldev,		/* identify */
300 	nulldev,		/* probe */
301 	ibd_attach,		/* attach */
302 	ibd_detach,		/* detach */
303 	nodev,			/* reset */
304 	&ibd_cb_ops,		/* cb_ops */
305 	NULL,			/* bus_ops */
306 	nodev			/* power */
307 };
308 
309 /* Module Driver Info */
310 static struct modldrv ibd_modldrv = {
311 	&mod_driverops,
312 	"InfiniBand DLPI Driver %I%",
313 	&ibd_dev_ops
314 };
315 
316 /* Module Linkage */
317 static struct modlinkage ibd_modlinkage = {
318 	MODREV_1,
319 	&ibd_modldrv,
320 	NULL
321 };
322 
323 /*
324  * Module Info passed to IBTL during IBT_ATTACH.
325  *   NOTE:  This data must be static (i.e. IBTL just keeps a pointer to this
326  *	    data).
327  */
328 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
329 	IBTI_V2,
330 	IBT_NETWORK,
331 	ibd_async_handler,
332 	NULL,
333 	"IPIB"
334 };
335 
336 /*
337  * Async operation types.
338  */
339 #define	ASYNC_GETAH	1
340 #define	ASYNC_JOIN	2
341 #define	ASYNC_LEAVE	3
342 #define	ASYNC_PROMON	4
343 #define	ASYNC_PROMOFF	5
344 #define	ASYNC_REAP	6
345 #define	ASYNC_POKE	7
346 #define	ASYNC_TRAP	8
347 #define	ASYNC_SCHED	9
348 #define	ASYNC_LINK	10
349 #define	ASYNC_EXIT	11
350 
351 /*
352  * Async operation states
353  */
354 #define	NOTSTARTED	0
355 #define	ONGOING		1
356 #define	COMPLETED	2
357 #define	ERRORED		3
358 #define	ROUTERED	4
359 
360 #define	IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF
361 
362 #ifdef DEBUG
363 
364 static int rxpack = 1, txpack = 1;
365 int debuglevel = 100;
366 static void
367 debug_print(int l, char *fmt, ...)
368 {
369 	va_list ap;
370 
371 	if (l < debuglevel)
372 		return;
373 	va_start(ap, fmt);
374 	vcmn_err(CE_CONT, fmt, ap);
375 	va_end(ap);
376 }
377 #define	INCRXPACK	(rxpack++)
378 #define	INCTXPACK	(txpack++)
379 #define	DPRINT		debug_print
380 
381 #else /* DEBUG */
382 
383 #define	INCRXPACK	0
384 #define	INCTXPACK	0
385 #define	DPRINT
386 
387 #endif /* DEBUG */
388 
389 /*
390  * Common routine to print warning messages; adds in hca guid, port number
391  * and pkey to be able to identify the IBA interface.
392  */
393 static void
394 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
395 {
396 	ib_guid_t hca_guid;
397 	char ibd_print_buf[256];
398 	int len;
399 	va_list ap;
400 
401 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
402 	    0, "hca-guid", 0);
403 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
404 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname,
405 	    state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid,
406 	    state->id_port, state->id_pkey);
407 	va_start(ap, fmt);
408 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
409 	    fmt, ap);
410 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
411 	va_end(ap);
412 }
413 
414 /* warlock directives */
415 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
416     ibd_state_t::id_ah_active))
417 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free))
418 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
419     ibd_state_t::id_req_list))
420 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
421     ibd_state_t::id_acache_req_cv))
422 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
423     ibd_state_t::id_multi_req))
424 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
425     ibd_state_t::id_multi_addr))
426 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
427     ibd_state_t::id_multi_op))
428 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
429     ibd_state_t::id_multi_queued))
430 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
431     ibd_state_t::id_mc_full))
432 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
433     ibd_state_t::id_mc_non))
434 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
435     ibd_state_t::id_link_state))
436 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
437     ibd_state_s::id_tx_list))
438 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
439     ibd_state_s::id_rx_list))
440 
441 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op))
442 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error))
443 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op))
444 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs))
445 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op))
446 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short))
447 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list))
448 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list))
449 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op))
450 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid))
451 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr))
452 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce))
453 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref))
454 
455 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s))
456 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s))
457 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s))
458 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac))
459 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh))
460 
461 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s))
462 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req))
463 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap))
464 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate))
465 
466 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr))
467 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr))
468 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats))
469 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id))
470 
471 #ifdef DEBUG
472 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack))
473 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack))
474 #endif
475 
476 int
477 _init()
478 {
479 	int status;
480 
481 	/*
482 	 * Sanity check some parameter settings. Tx completion polling
483 	 * only makes sense with separate CQs for Tx and Rx.
484 	 */
485 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
486 		cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname,
487 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
488 		ibd_txcomp_poll = 0;
489 	}
490 
491 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
492 	if (status != 0) {
493 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
494 		return (status);
495 	}
496 
497 	status = mod_install(&ibd_modlinkage);
498 	if (status != 0) {
499 		DPRINT(10, "_init:failed in mod_install()");
500 		ddi_soft_state_fini(&ibd_list);
501 		return (status);
502 	}
503 
504 	return (0);
505 }
506 
507 int
508 _info(struct modinfo *modinfop)
509 {
510 	return (mod_info(&ibd_modlinkage, modinfop));
511 }
512 
513 int
514 _fini()
515 {
516 	int status;
517 
518 	status = mod_remove(&ibd_modlinkage);
519 	if (status != 0)
520 		return (status);
521 
522 	ddi_soft_state_fini(&ibd_list);
523 	return (0);
524 }
525 
526 /*
527  * Convert the GID part of the mac address from network byte order
528  * to host order.
529  */
530 static void
531 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
532 {
533 	ib_sn_prefix_t nbopref;
534 	ib_guid_t nboguid;
535 
536 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
537 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
538 	dgid->gid_prefix = b2h64(nbopref);
539 	dgid->gid_guid = b2h64(nboguid);
540 }
541 
542 /*
543  * Create the IPoIB address in network byte order from host order inputs.
544  */
545 static void
546 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
547     ib_guid_t guid)
548 {
549 	ib_sn_prefix_t nbopref;
550 	ib_guid_t nboguid;
551 
552 	mac->ipoib_qpn = htonl(qpn);
553 	nbopref = h2b64(prefix);
554 	nboguid = h2b64(guid);
555 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
556 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
557 }
558 
559 /*
560  * Send to the appropriate all-routers group when the IBA multicast group
561  * does not exist, based on whether the target group is v4 or v6.
562  */
563 static boolean_t
564 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
565     ipoib_mac_t *rmac)
566 {
567 	boolean_t retval = B_TRUE;
568 	uint32_t adjscope = state->id_scope << 16;
569 	uint32_t topword;
570 
571 	/*
572 	 * Copy the first 4 bytes in without assuming any alignment of
573 	 * input mac address; this will have IPoIB signature, flags and
574 	 * scope bits.
575 	 */
576 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
577 	topword = ntohl(topword);
578 
579 	/*
580 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
581 	 */
582 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
583 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
584 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
585 		    ((uint32_t)(state->id_pkey << 16))),
586 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
587 	else
588 		/*
589 		 * Does not have proper bits in the mgid address.
590 		 */
591 		retval = B_FALSE;
592 
593 	return (retval);
594 }
595 
596 /*
597  * Implementation of various (software) flavors of send and receive side
598  * checksumming.
599  */
/*
 * Transmit side software checksum for the single mblk mp, whose data
 * starts with the IPoIB header. Does nothing when ibd_csum_send is
 * IBD_CSUM_NONE or when hcksum_retrieve() does not report
 * HCK_PARTIALCKSUM. For IBD_CSUM_PARTIAL the checksum is computed with
 * IP_BCSUM_PARTIAL from the start offset to b_wptr, seeded with the value
 * found at the stuff offset; otherwise that value is used as is. The
 * complemented result is stored at the stuff offset.
 *
 * Expands to a block containing the label punt_send, so it can be used at
 * most once per function.
 */
#define	IBD_CKSUM_SEND(mp) {						\
	uint32_t start, stuff, end, value, flags;			\
	uint32_t cksum = 0, sum;					\
	uchar_t *dp, *buf;						\
	uint16_t *up;							\
									\
	if (ibd_csum_send == IBD_CSUM_NONE)				\
		goto punt_send;						\
									\
	/*								\
	 * Query IP whether Tx cksum needs to be done.			\
	 */								\
	hcksum_retrieve((mp), NULL, NULL, &start, &stuff, &end,		\
	    &value, &flags);						\
									\
	if (flags == HCK_PARTIALCKSUM)	{				\
		dp = ((uchar_t *)(mp)->b_rptr + IPOIB_HDRSIZE);		\
		up =  (uint16_t *)(dp + stuff);				\
		if (ibd_csum_send == IBD_CSUM_PARTIAL) {		\
			end = ((uchar_t *)(mp)->b_wptr - dp - start);	\
			cksum = *up;					\
			*up = 0;					\
			/*						\
			 * Does NOT handle chained mblks/more than one	\
			 * SGL. Applicable only for a single SGL	\
			 * entry/mblk, where the stuff offset is	\
			 * within the range of buf.			\
			 */						\
			buf = (dp + start);				\
			sum = IP_BCSUM_PARTIAL(buf, end, cksum);	\
		} else {						\
			sum = *up;					\
		}							\
		/*							\
		 * cksum is only set in the partial mode above; it is	\
		 * initialized to 0 so this trace never reads garbage.	\
		 */							\
		DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n",	\
		    start, stuff, end, sum, cksum);			\
		sum = ~(sum);						\
		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
	}								\
punt_send:								\
	;								\
}
641 
/*
 * Receive side software checksum for mp, whose data starts with the GRH
 * (IPOIB_GRH_SIZE bytes are skipped to reach the IPoIB header). Does
 * nothing when ibd_csum_recv is IBD_CSUM_NONE, when the IPoIB type is not
 * ETHERTYPE_IP, when any fragment bit other than IPH_DF is set, or when
 * the protocol is neither TCP nor UDP. Otherwise HCK_PARTIALCKSUM metadata
 * is attached to mp with hcksum_assoc(): for IBD_CSUM_PARTIAL the value is
 * computed with IP_BCSUM_PARTIAL over the bytes between the end of the IP
 * header and ipha_length, else the 16 bit word found at the checksum
 * offset (IP header length + 16 for TCP, + 6 for UDP) is passed along.
 *
 * Expands to a block containing the label punt_recv, so it can be used at
 * most once per function.
 *
 * NOTE(review): none of the header fields are validated against the amount
 * of data actually present in mp -- confirm callers guarantee this.
 */
#define	IBD_CKSUM_RECV(mp) {						\
	uchar_t *dp, *buf;						\
	uint32_t start, end, value, stuff, flags;			\
	uint16_t *up, frag;						\
	ipha_t *iphp;							\
	ipoib_hdr_t *ipibh;						\
									\
	if (ibd_csum_recv == IBD_CSUM_NONE)				\
		goto punt_recv;					 	\
									\
	ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\
	if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP)		 	\
		goto punt_recv;						\
									\
	dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE);			\
	iphp = (ipha_t *)dp;						\
	frag = ntohs(iphp->ipha_fragment_offset_and_flags);		\
	if ((frag) & (~IPH_DF))						\
		goto punt_recv;						\
	start = IPH_HDR_LENGTH(iphp);					\
	if (iphp->ipha_protocol == IPPROTO_TCP)				\
		stuff = start + 16;					\
	else if (iphp->ipha_protocol == IPPROTO_UDP)			\
		stuff = start + 6;					\
	else								\
		goto punt_recv;						\
									\
	flags = HCK_PARTIALCKSUM;					\
	end = ntohs(iphp->ipha_length);					\
	up = (uint16_t *)(dp + stuff);					\
									\
	if (ibd_csum_recv == IBD_CSUM_PARTIAL) {			\
		buf = (dp + start);					\
		value = IP_BCSUM_PARTIAL(buf, end - start, 0);		\
	} else {							\
		value = (*up);						\
	}								\
	if (hcksum_assoc(mp, NULL, NULL, start, stuff, end,		\
	    value, flags, 0) != 0)					\
		DPRINT(10, "cksum_recv: value: %x\n", value);		\
punt_recv:								\
	;								\
}
685 
/*
 * MDT flavor of the Tx checksum query: when send checksumming is enabled
 * (ibd_csum_send is not IBD_CSUM_NONE), hcksum_retrieve() is called with
 * the arguments passed straight through; otherwise nothing is done and the
 * caller's output locations are left untouched.
 */
#define	IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) {		\
	/*								\
	 * Query IP whether Tx cksum needs to be done.			\
	 */								\
	if (ibd_csum_send != IBD_CSUM_NONE)				\
		hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp);	\
}
693 
/*
 * Per packet Tx checksum for the MDT path. pinfo describes the packet
 * (header area plus pld_cnt payload areas); st and stf are the checksum
 * start and stuff offsets relative to the byte following the IPoIB header;
 * fl is the checksum flag word. Nothing is done unless send checksumming
 * is enabled and fl is HCK_PARTIALCKSUM. For IBD_CSUM_PARTIAL the sum is
 * accumulated with IP_BCSUM_PARTIAL over the header area from st onwards
 * and then over every payload area, seeded with the value found at stf;
 * otherwise that value is used as is. The complemented result is stored
 * at stf.
 *
 * All macro arguments are parenthesized so that expression arguments
 * expand safely.
 */
#define	IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) {			\
	if ((ibd_csum_send != IBD_CSUM_NONE) &&				\
	    ((fl) == HCK_PARTIALCKSUM)) {				\
		extern uint_t bcksum(uchar_t *, int, uint32_t);		\
		uint16_t *up;						\
		uint32_t sum;						\
		uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE;	\
		int k;							\
									\
		up = (uint16_t *)(hp + (stf));				\
		if (ibd_csum_send == IBD_CSUM_PARTIAL) {		\
			sum = *up;					\
			*up = 0;					\
			sum = IP_BCSUM_PARTIAL(hp + (st),		\
			    PDESC_HDRL(pinfo) - (st) - IPOIB_HDRSIZE,	\
			    sum);					\
			for (k = 0; k < (pinfo)->pld_cnt; k++)		\
				sum = IP_BCSUM_PARTIAL(			\
				    (pinfo)->pld_ary[k].pld_rptr,	\
				    PDESC_PLDL(pinfo, k), sum);		\
		} else {						\
			sum = *up;					\
		}							\
		sum = ~(sum);						\
		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
	}								\
}
721 
722 /*
723  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
724  * front of optional src/tgt link layer address. Right now Solaris inserts
725  * padding by default at the end. The routine which is doing is nce_xmit()
726  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
727  * the packet comes down from IP layer to the IBD driver, it is in the
728  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
729  * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
730  * machdr is not 4 byte aligned and had 2 bytes of padding at the end.
731  *
732  * The send routine at IBD driver changes this packet as follows:
733  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
734  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
735  * aligned.
736  *
737  * At the receiving side again ibd_process_rx takes the above packet and
738  * removes the two bytes of front padding and inserts it at the end. This
739  * is since the IP layer does not understand padding at the front.
740  */
/*
 * Move the 2 bytes of padding that accompany the link layer address option
 * of a Neighbor Solicitation/Advertisement.
 *
 * ip6h: pointer to the IPv6 header; the ICMPv6 header is taken to follow
 *	 it directly.
 * len:	 lvalue holding the length of the ICMPv6 part; it is reduced by
 *	 sizeof (nd_neighbor_advert_t) as a side effect, in every case.
 * type: IBD_SEND shifts the IPOIB_ADDRL address bytes up by two and zeroes
 *	 the two bytes in front of them; any other value (IBD_RECV) shifts
 *	 the address down by two and zeroes the two bytes behind it.
 *
 * Nothing is rewritten unless the message is an NS or NA and option bytes
 * remain after the fixed part. The remaining length is tested with "> 0"
 * so that a signed length gone negative on a short packet is not taken for
 * a present option.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t 	*nd_lla_ptr;					\
	icmp6_t 	*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int 		i;						\
									\
	icmp6 = (icmp6_t *)&(ip6h)[1];					\
	(len) -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    ((len) > 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)(ip6h)		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if ((type) == IBD_SEND) {				\
			/* copy backwards: the ranges overlap */	\
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		/* i is 0 after a send shift, IPOIB_ADDRL after recv */	\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
769 
770 /*
771  * The service fifo code is copied verbatim from Cassini. This can be
772  * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu.
773  */
774 
775 typedef caddr_t fifo_obj_t, *p_fifo_obj_t;
776 
/*
 * A service fifo: a fixed size ring of object pointers that is filled with
 * _ddi_put_fifo() and emptied with _ddi_get_fifo() by a dedicated thread.
 */
typedef struct _srv_fifo_t {
	kmutex_t fifo_lock;	/* held while the ring state is accessed */
	kcondvar_t fifo_cv;	/* signalled on put and for shutdown */
	size_t size;		/* number of slots in fifo_objs */
	uint_t max_index;	/* size - 1; indices wrap to 0 after it */
	uint_t rd_index;	/* slot last read; advanced before a read */
	uint_t wr_index;	/* slot last written; advanced before write */
	uint_t objs_pending;	/* objects queued and not yet dequeued */
	p_fifo_obj_t fifo_objs;	/* the ring itself, size entries */
	kthread_t *fifo_thread;	/* thread created by _ddi_srv_fifo_create */
	void (*drain_func)(caddr_t drain_func_arg); /* thread entry point */
	caddr_t drain_func_arg;	/* handed out by _ddi_srv_fifo_begin */
	boolean_t running;	/* DDI_SUCCESS, or DDI_FAILURE once stopping */
	callb_cpr_t cprinfo;	/* CPR state set up in _ddi_srv_fifo_begin */
} srv_fifo_t, *p_srv_fifo_t;
792 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv))
793 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo))
794 
795 static int
796 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size,
797 			void (*drain_func)(), caddr_t drain_func_arg)
798 {
799 	int status;
800 	p_srv_fifo_t srv_fifo;
801 
802 	status = DDI_SUCCESS;
803 	srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP);
804 	srv_fifo->size = size;
805 	srv_fifo->max_index = size - 1;
806 	srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc(
807 				size * sizeof (fifo_obj_t), KM_SLEEP);
808 	mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL);
809 	cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL);
810 	srv_fifo->drain_func = drain_func;
811 	srv_fifo->drain_func_arg = drain_func_arg;
812 	srv_fifo->running = DDI_SUCCESS;
813 	srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func,
814 				(caddr_t)srv_fifo, 0, &p0, TS_RUN, 60);
815 	if (srv_fifo->fifo_thread == NULL) {
816 		cv_destroy(&srv_fifo->fifo_cv);
817 		mutex_destroy(&srv_fifo->fifo_lock);
818 		kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t));
819 		kmem_free(srv_fifo, sizeof (srv_fifo_t));
820 		srv_fifo = NULL;
821 		status = DDI_FAILURE;
822 	} else
823 		*handle = srv_fifo;
824 	return (status);
825 }
826 
827 static void
828 _ddi_srv_fifo_destroy(p_srv_fifo_t handle)
829 {
830 	kt_did_t tid = handle->fifo_thread->t_did;
831 
832 	mutex_enter(&handle->fifo_lock);
833 	handle->running = DDI_FAILURE;
834 	cv_signal(&handle->fifo_cv);
835 	while (handle->running == DDI_FAILURE)
836 		cv_wait(&handle->fifo_cv, &handle->fifo_lock);
837 	mutex_exit(&handle->fifo_lock);
838 	if (handle->objs_pending != 0)
839 		cmn_err(CE_NOTE, "!Thread Exit with work undone.");
840 	cv_destroy(&handle->fifo_cv);
841 	mutex_destroy(&handle->fifo_lock);
842 	kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t));
843 	kmem_free(handle, sizeof (srv_fifo_t));
844 	thread_join(tid);
845 }
846 
847 static caddr_t
848 _ddi_srv_fifo_begin(p_srv_fifo_t handle)
849 {
850 #ifndef __lock_lint
851 	CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock,
852 			callb_generic_cpr, "srv_fifo");
853 #endif /* ! _lock_lint */
854 	return (handle->drain_func_arg);
855 }
856 
/*
 * Last call made by a service thread; does not return. Acknowledges the
 * stop request made by _ddi_srv_fifo_destroy() and exits the thread.
 *
 * fifo_lock is taken here and never explicitly dropped by this function.
 * NOTE(review): CALLB_CPR_EXIT() is expected to release the lock that was
 * registered with CALLB_CPR_INIT() -- confirm against sys/callb.h.
 */
static void
_ddi_srv_fifo_end(p_srv_fifo_t handle)
{
	callb_cpr_t cprinfo;

	mutex_enter(&handle->fifo_lock);
	/* Work on a private copy of the CPR state from here on. */
	cprinfo = handle->cprinfo;
	/* Flip running back; _ddi_srv_fifo_destroy() is waiting for this. */
	handle->running = DDI_SUCCESS;
	cv_signal(&handle->fifo_cv);
#ifndef __lock_lint
	CALLB_CPR_EXIT(&cprinfo);
#endif /* !__lock_lint */
	thread_exit();
	_NOTE(NOT_REACHED)
}
872 
873 static int
874 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal)
875 {
876 	int status;
877 
878 	mutex_enter(&handle->fifo_lock);
879 	status = handle->running;
880 	if (status == DDI_SUCCESS) {
881 		if (ptr) {
882 			if (handle->objs_pending < handle->size) {
883 				if (handle->wr_index == handle->max_index)
884 					handle->wr_index = 0;
885 				else
886 					handle->wr_index++;
887 				handle->fifo_objs[handle->wr_index] = ptr;
888 				handle->objs_pending++;
889 			} else
890 				status = DDI_FAILURE;
891 			if (signal)
892 				cv_signal(&handle->fifo_cv);
893 		} else {
894 			if (signal && (handle->objs_pending > 0))
895 				cv_signal(&handle->fifo_cv);
896 		}
897 	}
898 	mutex_exit(&handle->fifo_lock);
899 	return (status);
900 }
901 
/*
 * Consumer side of a service fifo: fetch the next queued object into
 * *ptr, waiting once on fifo_cv if the fifo is running but empty.
 *
 * Returns the value of handle->running sampled after the (possible)
 * wait while the fifo was running on entry; when the fifo was already
 * not running on entry, returns DDI_SUCCESS while queued objects remain
 * to be drained and DDI_FAILURE once it is empty.
 *
 * NOTE(review): two paths are visible in the code below and are worth
 * confirming against the producers and the destroy path:
 *  - a wakeup that finds no object pending while running is still
 *    DDI_SUCCESS returns DDI_SUCCESS with *ptr == NULL, and
 *    drain_fifo() passes the object on without a NULL check;
 *  - if running changes during the wait and an object is pending, the
 *    object is dequeued into *ptr but a non-success status is returned.
 */
static int
_ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr)
{
	int status;

	mutex_enter(&handle->fifo_lock);
	status = handle->running;
	if (status == DDI_SUCCESS) {
		if (handle->objs_pending == 0) {
			/*
			 * Nothing queued: wait (once, not in a loop) for a
			 * signal, marking the thread CPR-safe meanwhile.
			 * The wait is compiled out under __lock_lint.
			 */
#ifndef __lock_lint
			CALLB_CPR_SAFE_BEGIN(&handle->cprinfo);
			cv_wait(&handle->fifo_cv, &handle->fifo_lock);
			CALLB_CPR_SAFE_END(&handle->cprinfo,
						&handle->fifo_lock);
#endif /* !__lock_lint */
			*ptr = NULL;
		}
		if (handle->objs_pending > 0) {
			/* Advance the read index, wrapping at max_index. */
			if (handle->rd_index == handle->max_index)
				handle->rd_index = 0;
			else
				handle->rd_index++;
			*ptr = handle->fifo_objs[handle->rd_index];
			handle->objs_pending--;
		}
		/* Re-sample: the state may have changed while waiting. */
		status = handle->running;
	} else {
		/* Not running: hand out what is left, then report failure. */
		if (handle->objs_pending) {
			if (handle->rd_index == handle->max_index)
				handle->rd_index = 0;
			else
				handle->rd_index++;
			*ptr = handle->fifo_objs[handle->rd_index];
			handle->objs_pending--;
			status = DDI_SUCCESS;
		} else
			status = DDI_FAILURE;
	}
	mutex_exit(&handle->fifo_lock);
	return (status);
}
943 
944 /*
945  * [un]map_rx_srv_fifos has been modified from its CE version.
946  */
947 static void
948 drain_fifo(p_srv_fifo_t handle)
949 {
950 	ibd_state_t *state;
951 	mblk_t *mp;
952 
953 	state = (ibd_state_t *)_ddi_srv_fifo_begin(handle);
954 	while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) {
955 		/*
956 		 * Hand off to GLD.
957 		 */
958 		IBD_CKSUM_RECV(mp);
959 		gld_recv(state->id_macinfo, mp);
960 	}
961 	_ddi_srv_fifo_end(handle);
962 }
963 
964 static p_srv_fifo_t *
965 map_rx_srv_fifos(int *nfifos, void *private)
966 {
967 	p_srv_fifo_t *srv_fifos;
968 	int i, inst_taskqs, depth;
969 
970 	/*
971 	 * Default behavior on sparc cpus (with lower cpu frequency) is
972 	 * to use service fifo if ncpus > 1 and not to use service fifo
973 	 * on single cpu systems; on intel/amd cpus (with higher cpu
974 	 * frequency), the default is never to use service fifos. This
975 	 * can be changed by tweaking ibd_srv_fifos (set to 0 or 1
976 	 * by administrator). On single cpu systems, network
977 	 * processing is given lower priority if using service
978 	 * threads, thus possibly making the system more usable
979 	 * at high network loads (maybe by throttling network
980 	 * throughput).
981 	 */
982 	if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) ||
983 #if !defined(__sparc)
984 	    (ibd_srv_fifos == 0xffff) ||
985 #endif
986 	    (ibd_srv_fifos == 0)) {
987 		*nfifos = 0;
988 		return ((p_srv_fifo_t *)1);
989 	}
990 
991 	*nfifos = inst_taskqs;
992 	srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t),
993 	    KM_SLEEP);
994 
995 	/*
996 	 * If the administrator has specified a fifo depth, use
997 	 * that, else just decide what should be the depth.
998 	 */
999 	if (ibd_fifo_depth == 0)
1000 		depth = (IBD_NUM_RWQE / inst_taskqs) + 16;
1001 	else
1002 		depth = ibd_fifo_depth;
1003 
1004 	for (i = 0; i < inst_taskqs; i++)
1005 		if (_ddi_srv_fifo_create(&srv_fifos[i],
1006 		    depth, drain_fifo,
1007 		    (caddr_t)private) != DDI_SUCCESS)
1008 			break;
1009 
1010 	if (i < inst_taskqs)
1011 		goto map_rx_srv_fifos_fail1;
1012 
1013 	goto map_rx_srv_fifos_exit;
1014 
1015 map_rx_srv_fifos_fail1:
1016 	i--;
1017 	for (; i >= 0; i--) {
1018 		_ddi_srv_fifo_destroy(srv_fifos[i]);
1019 	}
1020 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
1021 	srv_fifos = NULL;
1022 
1023 map_rx_srv_fifos_exit:
1024 	return (srv_fifos);
1025 }
1026 
1027 static void
1028 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos)
1029 {
1030 	int i;
1031 
1032 	/*
1033 	 * If this interface was not using service fifos, quickly return.
1034 	 */
1035 	if (inst_taskqs == 0)
1036 		return;
1037 
1038 	for (i = 0; i < inst_taskqs; i++) {
1039 		_ddi_srv_fifo_destroy(srv_fifos[i]);
1040 	}
1041 	kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t));
1042 }
1043 
1044 /*
1045  * Choose between sending up the packet directly and handing off
1046  * to a service thread.
1047  */
1048 static void
1049 ibd_send_up(ibd_state_t *state, mblk_t *mp)
1050 {
1051 	p_srv_fifo_t *srvfifo;
1052 	ipoib_hdr_t *lhdr;
1053 	struct ip *ip_hdr;
1054 	struct udphdr *tran_hdr;
1055 	uchar_t prot;
1056 	int tnum = -1, nfifos = state->id_nfifos;
1057 
1058 	/*
1059 	 * Quick path if the interface is not using service fifos.
1060 	 */
1061 	if (nfifos == 0) {
1062 hand_off:
1063 		IBD_CKSUM_RECV(mp);
1064 		gld_recv(state->id_macinfo, mp);
1065 		return;
1066 	}
1067 
1068 	/*
1069 	 * Is the packet big enough to look at the IPoIB header
1070 	 * and basic IP header to determine whether it is an
1071 	 * IPv4 packet?
1072 	 */
1073 	if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE +
1074 	    sizeof (struct ip))) {
1075 
1076 		lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE);
1077 
1078 		/*
1079 		 * Is the packet an IP(v4) packet?
1080 		 */
1081 		if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) {
1082 
1083 			ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE +
1084 			    IPOIB_HDRSIZE);
1085 			prot = ip_hdr->ip_p;
1086 
1087 			/*
1088 			 * TCP or UDP packet? We use the UDP header, since
1089 			 * the first few words of both headers are laid out
1090 			 * similarly (src/dest ports).
1091 			 */
1092 			if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) {
1093 
1094 				tran_hdr = (struct udphdr *)(
1095 				    (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2));
1096 
1097 				/*
1098 				 * Are we within limits of this packet? If
1099 				 * so, use the destination port to hash to
1100 				 * a service thread.
1101 				 */
1102 				if (mp->b_wptr >= ((uchar_t *)tran_hdr +
1103 				    sizeof (*tran_hdr)))
1104 					tnum = (ntohs(tran_hdr->uh_dport) +
1105 					    ntohs(tran_hdr->uh_sport)) %
1106 					    nfifos;
1107 			}
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the
1113 	 * packet up in interrupt context, reducing latency.
1114 	 */
1115 	if (tnum == -1) {
1116 		goto hand_off;
1117 	}
1118 
1119 	srvfifo = (p_srv_fifo_t *)state->id_fifos;
1120 	if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp,
1121 	    B_TRUE) != DDI_SUCCESS)
1122 		freemsg(mp);
1123 }
1124 
1125 /*
1126  * Address handle entries maintained by the driver are kept in the
1127  * free and active lists. Each entry starts out in the free list;
1128  * it migrates to the active list when primed using ibt_get_paths()
1129  * and ibt_modify_ud_dest() for transmission to a specific destination.
1130  * In the active list, the entry has a reference count indicating the
1131  * number of ongoing/uncompleted transmits that reference it. The
1132  * entry is left in the active list even after the reference count
1133  * goes to 0, since successive transmits can find it there and do
1134  * not need to set up another entry (ie the path information is
1135  * cached using the active list). Entries on the active list are
1136  * also hashed using the destination link address as a key for faster
1137  * lookups during transmits.
1138  *
1139  * For any destination address (unicast or multicast, whatever the
1140  * join states), there will be at most one entry in the active list.
1141  * Entries with a 0 reference count on the active list can be reused
1142  * for a transmit to a new destination, if the free list is empty.
1143  *
1144  * The AH free list insertion/deletion is protected with the id_ac_mutex,
1145  * since the async thread and Tx callback handlers insert/delete. The
1146  * active list does not need a lock (all operations are done by the
1147  * async thread) but updates to the reference count are atomically
1148  * done (increments done by Tx path, decrements by the Tx callback handler).
1149  */
/*
 * AH cache list primitives. "state" is the per interface soft state
 * and "ce" an address handle cache entry; both arguments are fully
 * parenthesized so that any pointer expression may be passed.
 */

/* Put an entry at the head of the free list. */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&(state)->id_ah_free, (ce))

/* Remove and return the head of the free list (NULL if empty). */
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&(state)->id_ah_free)

/*
 * Put an entry on the active list and into the active hash, keyed by
 * the address of its ac_mac member.
 */
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&(state)->id_ah_active, (ce));		\
	_ret_ = mod_hash_insert((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce)); \
	ASSERT(_ret_ == 0);					\
}

/*
 * Take an entry off the active list and out of the active hash.
 * mod_hash_remove() stores the removed value through its last
 * argument, so a scratch variable is supplied for it; passing the
 * entry pointer itself would let the hash code overwrite the start
 * of the entry.
 */
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	mod_hash_val_t _val_;					\
	list_remove(&(state)->id_ah_active, (ce));		\
	(void) mod_hash_remove((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, &_val_);		\
}

/* Remove and return the head of the active list (NULL if empty). */
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&(state)->id_ah_active)
1168 
1169 /*
1170  * Membership states for different mcg's are tracked by two lists:
1171  * the "non" list is used for promiscuous mode, when all mcg traffic
1172  * needs to be inspected. This type of membership is never used for
1173  * transmission, so there can not be an AH in the active list
1174  * corresponding to a member in this list. This list does not need
1175  * any protection, since all operations are performed by the async
1176  * thread.
1177  *
1178  * "Full" and "SendOnly" membership is tracked using a single list,
1179  * the "full" list. This is because this single list can then be
1180  * searched during transmit to a multicast group (if an AH for the
1181  * mcg is not found in the active list), since at least one type
1182  * of membership must be present before initiating the transmit.
1183  * This list is also emptied during driver detach, since sendonly
1184  * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Insert/deletes to
1186  * this list are done only by the async thread, but it is also
1187  * searched in program context (see multicast disable case), thus
1188  * the id_mc_mutex protects the list. The driver detach path also
1189  * deconstructs the "full" list, but it ensures that the async
1190  * thread will not be accessing the list (by blocking out mcg
1191  * trap handling and making sure no more Tx reaping will happen).
1192  *
1193  * Currently, an IBA attach is done in the SendOnly case too,
1194  * although this is not required.
1195  */
/*
 * Multicast membership list primitives for the "full" and "non"
 * lists. Arguments are fully parenthesized so that any pointer
 * expression may be passed for "state" and "mce".
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
1208 
1209 /*
1210  * AH and MCE active list manipulation:
1211  *
1212  * Multicast disable requests and MCG delete traps are two cases
1213  * where the active AH entry for the mcg (if any unreferenced one exists)
1214  * will be moved to the free list (to force the next Tx to the mcg to
1215  * join the MCG in SendOnly mode). Port up handling will also move AHs
1216  * from active to free list.
1217  *
1218  * In the case when some transmits are still pending on an entry
1219  * for an mcg, but a multicast disable has already been issued on the
1220  * mcg, there are some options to consider to preserve the join state
1221  * to ensure the emitted packet is properly routed on the IBA fabric.
1222  * For the AH, we can
1223  * 1. take out of active list at multicast disable time.
1224  * 2. take out of active list only when last pending Tx completes.
1225  * For the MCE, we can
1226  * 3. take out of active list at multicast disable time.
1227  * 4. take out of active list only when last pending Tx completes.
1228  * 5. move from active list to stale list at multicast disable time.
1229  * We choose to use 2,4. We use option 4 so that if a multicast enable
1230  * is tried before the pending Tx completes, the enable code finds the
1231  * mce in the active list and just has to make sure it will not be reaped
1232  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1233  * a stale list (#5) that would be checked in the enable code would need
1234  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1235  * after the multicast disable would try to put an AH in the active list,
1236  * and associate the mce it finds in the active list to this new AH,
1237  * whereas the mce is already associated with the previous AH (taken off
1238  * the active list), and will be removed once the pending Tx's complete
1239  * (unless a reference count on mce's is implemented). One implication of
1240  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1241  * grab new references on the AH, further delaying the leave.
1242  *
1243  * In the case of mcg delete (or create) trap when the port is sendonly
1244  * joined, the AH and MCE handling is different: the AH and MCE has to be
1245  * immediately taken off the active lists (forcing a join and path lookup
1246  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1247  * to an mcg as it is repeatedly created and deleted and goes thru
1248  * reincarnations).
1249  *
1250  * When a port is already sendonly joined, and a multicast enable is
1251  * attempted, the same mce structure is promoted; this ensures only a
1252  * single mce on the active list tracks the most powerful join state.
1253  *
1254  * In the case of port up event handling, the MCE for sendonly membership
1255  * is freed up, and the ACE is put into the free list as soon as possible
1256  * (depending on whether posted Tx's have completed). For fullmembership
1257  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1258  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1259  * done; else the mce is deconstructed (mc_fullreap case).
1260  *
1261  * MCG creation and deletion trap handling:
1262  *
1263  * These traps are unreliable (meaning sometimes the trap might never
1264  * be delivered to the subscribed nodes) and may arrive out-of-order
1265  * since they use UD transport. An alternative to relying on these
1266  * unreliable traps is to poll for mcg presence every so often, but
1267  * instead of doing that, we try to be as conservative as possible
1268  * while handling the traps, and hope that the traps do arrive at
1269  * the subscribed nodes soon. Note that if a node is fullmember
1270  * joined to an mcg, it can not possibly receive a mcg create/delete
1271  * trap for that mcg (by fullmember definition); if it does, it is
1272  * an old trap from a previous incarnation of the mcg.
1273  *
1274  * Whenever a trap is received, the driver cleans up its sendonly
1275  * membership to the group; we choose to do a sendonly leave even
1276  * on a creation trap to handle the case of a prior deletion of the mcg
1277  * having gone unnoticed. Consider an example scenario:
1278  * T1: MCG M is deleted, and fires off deletion trap D1.
1279  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1280  * T3: Node N tries to transmit to M, joining in sendonly mode.
1281  * T4: MCG M is deleted, and fires off deletion trap D2.
1282  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1283  *     If the trap is D2, then a LEAVE is not required, since the mcg
1284  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1285  *     approach is to always LEAVE, but the SM may be confused if it
1286  *     receives a LEAVE without a prior JOIN.
1287  *
1288  * Management of the non-membership to an mcg is similar to the above,
1289  * except that if the interface is in promiscuous mode, it is required
1290  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1291  * if the re-join attempt fails (in which case a warning message needs
1292  * to be printed), it is not clear whether it failed due to the mcg not
1293  * existing, or some fabric/hca issues, due to the delayed nature of
1294  * trap delivery. Querying the SA to establish presence/absence of the
1295  * mcg is also racy at best. Thus, the driver just prints a warning
1296  * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
1298  * received after the mcg was deleted.
1299  */
1300 
/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that max reference
 * count on any handle is limited by number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 *
 * CYCLEVAL		the "cycle" (recycle) bit within ac_ref.
 * CLEAR_REFCYCLE	plain store of 0: clears both count and cycle bit.
 * CYCLE_SET		true if the cycle bit is set.
 * GET_REF		raw ac_ref value (count plus cycle bit, if set).
 * GET_REF_CYCLE	reference count alone; asserts the cycle bit is set.
 * INC_REF		atomically add num references.
 * SET_CYCLE_IF_REF	B_TRUE if the cycle bit is (now) set because
 *			references are outstanding; B_FALSE, with ac_ref
 *			left at 0, if there were none.
 * DEC_REF_DO_CYCLE	atomically drop one reference; B_TRUE if that was
 *			the last one and the cycle bit is set.
 *
 * Every macro parenthesizes its "ace" argument, so any pointer
 * expression may be passed.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	((ace)->ac_ref = 0)
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, (num));		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		(CLEAR_REFCYCLE(ace), B_FALSE) :	\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_add_32_nv(&(ace)->ac_ref, -1) ==		\
	    CYCLEVAL ?					\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
1349 
1350 static void *
1351 list_get_head(list_t *list)
1352 {
1353 	list_node_t *lhead = list_head(list);
1354 
1355 	if (lhead != NULL)
1356 		list_remove(list, lhead);
1357 	return (lhead);
1358 }
1359 
1360 /*
1361  * This is always guaranteed to be able to queue the work.
1362  */
1363 static void
1364 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1365 {
1366 	/* Initialize request */
1367 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1368 	ptr->rq_op = op;
1369 
1370 	/*
1371 	 * Queue provided slot onto request pool.
1372 	 */
1373 	mutex_enter(&state->id_acache_req_lock);
1374 	list_insert_tail(&state->id_req_list, ptr);
1375 
1376 	/* Go, fetch, async thread */
1377 	cv_signal(&state->id_acache_req_cv);
1378 	mutex_exit(&state->id_acache_req_lock);
1379 }
1380 
/*
 * Main body of the per interface async thread.
 *
 * Loops forever taking requests off id_req_list (queued through
 * ibd_queue_work_slot()) and dispatching on rq_op; sleeps on
 * id_acache_req_cv when the list is empty. The request lock is dropped
 * while a request is processed and while gld_sched() is called. The
 * only way out of the loop is an ASYNC_EXIT request.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");
	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP).
			 */

			/*
			 * Perform the request. There is no default case:
			 * an unrecognized rq_op does nothing here and just
			 * falls through to the gld_sched() below.
			 */
			switch (ptr->rq_op) {
				case ASYNC_GETAH:
					ibd_async_acache(state, &ptr->rq_mac);
					break;
				case ASYNC_POKE:
					/*
					 * We need the gld_sched; that
					 * happens below. No locks are
					 * needed for the multi_op update.
					 */
					state->id_multi_op = NOTSTARTED;
					break;
				case ASYNC_REAP:
					ibd_async_reap_group(state,
					    ptr->rq_ptr, ptr->rq_gid,
					    IB_MC_JSTATE_FULL);
					break;
				case ASYNC_LEAVE:
				case ASYNC_JOIN:
					ibd_async_multicast(state,
					    ptr->rq_gid, ptr->rq_op);
					break;
				case ASYNC_PROMON:
					ibd_async_setprom(state, B_TRUE);
					break;
				case ASYNC_PROMOFF:
					ibd_async_unsetprom(state, B_TRUE);
					break;
				case ASYNC_TRAP:
					ibd_async_trap(state, ptr);
					break;
				case ASYNC_SCHED:
					ibd_async_txsched(state);
					break;
				case ASYNC_LINK:
					ibd_async_link(state, ptr);
					break;
				case ASYNC_EXIT:
					/*
					 * Retake the request lock before
					 * the CPR exit; there is no explicit
					 * mutex_exit() on this path.
					 * NOTE(review): CALLB_CPR_EXIT() is
					 * expected to drop the lock, and the
					 * return below is in fact reached in
					 * spite of the lint annotation --
					 * confirm against sys/callb.h.
					 */
					mutex_enter(&state->id_acache_req_lock);
#ifndef	__lock_lint
					CALLB_CPR_EXIT(&cprinfo);
#endif /* !__lock_lint */
					_NOTE(NOT_REACHED)
					return;
			}

			/*
			 * Indicate blocked operation can now be retried.
			 * Note gld_sched() gets the gld_maclock,
			 * and the multicast/promiscuous paths
			 * (ibd_set_multicast(), ibd_set_promiscuous())
			 * grab id_acache_req_lock in ibd_queue_work_slot()
			 * with gld_maclock held, so we must not hold the
			 * id_acache_req_lock while calling gld_sched to
			 * prevent deadlock.
			 */
			gld_sched(state->id_macinfo);

			mutex_enter(&state->id_acache_req_lock);
		} else {
			/*
			 * Nothing to do: wait till new request arrives.
			 * The thread is marked CPR-safe for the duration
			 * of the wait.
			 */
#ifndef __lock_lint
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif /* !__lock_lint */
		}
	}
	/*NOTREACHED*/
}
1479 
1480 /*
1481  * Return when it is safe to queue requests to the async daemon; primarily
1482  * for subnet trap and async event handling. Disallow requests before the
1483  * daemon is created, and when interface deinitilization starts.
1484  */
1485 static boolean_t
1486 ibd_async_safe(ibd_state_t *state)
1487 {
1488 	mutex_enter(&state->id_trap_lock);
1489 	if (state->id_trap_stop) {
1490 		mutex_exit(&state->id_trap_lock);
1491 		return (B_FALSE);
1492 	}
1493 	state->id_trap_inprog++;
1494 	mutex_exit(&state->id_trap_lock);
1495 	return (B_TRUE);
1496 }
1497 
1498 /*
1499  * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
1500  * trap or event handling to complete to kill the async thread and deconstruct
1501  * the mcg/ace list.
1502  */
1503 static void
1504 ibd_async_done(ibd_state_t *state)
1505 {
1506 	mutex_enter(&state->id_trap_lock);
1507 	if (--state->id_trap_inprog == 0)
1508 		cv_signal(&state->id_trap_cv);
1509 	mutex_exit(&state->id_trap_lock);
1510 }
1511 
1512 /*
1513  * Hash functions:
1514  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1515  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1516  * These operate on mac addresses input into ibd_send, but there is no
1517  * guarantee on the alignment of the ipoib_mac_t structure.
1518  */
1519 /*ARGSUSED*/
1520 static uint_t
1521 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1522 {
1523 	ulong_t ptraddr = (ulong_t)key;
1524 	uint_t hval;
1525 
1526 	/*
1527 	 * If the input address is 4 byte aligned, we can just dereference
1528 	 * it. This is most common, since IP will send in a 4 byte aligned
1529 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1530 	 * 4 byte aligned too.
1531 	 */
1532 	if ((ptraddr & 3) == 0)
1533 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1534 
1535 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1536 	return (hval);
1537 }
1538 
1539 static int
1540 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1541 {
1542 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1543 		return (0);
1544 	else
1545 		return (1);
1546 }
1547 
1548 /*
1549  * Initialize all the per interface caches and lists; AH cache,
1550  * MCG list etc.
1551  */
1552 static int
1553 ibd_acache_init(ibd_state_t *state)
1554 {
1555 	ibd_ace_t *ce;
1556 	int i;
1557 
1558 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1559 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1560 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1561 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1562 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1563 	    offsetof(ibd_ace_t, ac_list));
1564 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1565 	    offsetof(ibd_ace_t, ac_list));
1566 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1567 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1568 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1569 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1570 	    offsetof(ibd_mce_t, mc_list));
1571 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1572 	    offsetof(ibd_mce_t, mc_list));
1573 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1574 	    offsetof(ibd_req_t, rq_list));
1575 
1576 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1577 	    IBD_NUM_AH, KM_SLEEP);
1578 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1579 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1580 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1581 			ibd_acache_fini(state);
1582 			return (DDI_FAILURE);
1583 		} else {
1584 			CLEAR_REFCYCLE(ce);
1585 			ce->ac_mce = NULL;
1586 			IBD_ACACHE_INSERT_FREE(state, ce);
1587 		}
1588 	}
1589 	return (DDI_SUCCESS);
1590 }
1591 
/*
 * Tear down what ibd_acache_init() set up: free the UD destination of
 * every address handle entry on the active and free lists, destroy the
 * AH, MCG and request lists, free the entry array and destroy the
 * mutexes and the request cv. Also called by ibd_acache_init() itself
 * when it fails part way through.
 *
 * NOTE(review): id_ah_active_hash is neither emptied nor destroyed in
 * this function (entries are taken off the active list with
 * list_get_head() only, and there is no mod_hash_destroy_hash() call).
 * Confirm that the hash is released elsewhere; otherwise it leaks and
 * keeps pointers into the freed id_ac_list array.
 */
static void
ibd_acache_fini(ibd_state_t *state)
{
	ibd_ace_t *ptr;

	mutex_enter(&state->id_ac_mutex);

	/* Drain the active list; no Tx may still reference an entry. */
	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	/* Same for the free list. */
	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	list_destroy(&state->id_ah_free);
	list_destroy(&state->id_ah_active);
	list_destroy(&state->id_mc_full);
	list_destroy(&state->id_mc_non);
	list_destroy(&state->id_req_list);
	/* The entries themselves live in one array allocated by init. */
	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
	mutex_exit(&state->id_ac_mutex);
	mutex_destroy(&state->id_ac_mutex);
	mutex_destroy(&state->id_mc_mutex);
	mutex_destroy(&state->id_acache_req_lock);
	cv_destroy(&state->id_acache_req_cv);
}
1621 
1622 /*
1623  * Search AH active hash list for a cached path to input destination.
1624  * If we are "just looking", hold == F. When we are in the Tx path,
1625  * we set hold == T to grab a reference on the AH so that it can not
1626  * be recycled to a new destination while the Tx request is posted.
1627  */
1628 static ibd_ace_t *
1629 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1630 {
1631 	ibd_ace_t *ptr;
1632 
1633 	ASSERT(mutex_owned(&state->id_ac_mutex));
1634 
1635 	/*
1636 	 * Do hash search.
1637 	 */
1638 	if (mod_hash_find(state->id_ah_active_hash,
1639 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1640 		if (hold)
1641 			INC_REF(ptr, num);
1642 		return (ptr);
1643 	}
1644 	return (NULL);
1645 }
1646 
/*
 * This is called by the tx side; if an initialized AH is found in
 * the active list, it is locked down and can be used; if no entry
 * is found, an async request is queued to do path resolution.
 *
 * Returns the AH entry with numwqe references added, or NULL. On the
 * first-lookup hit *err is left untouched; on every other path *err is
 * set to GLD_NORESOURCES (retry later) or GLD_FAILURE (the queued path
 * lookup for this address failed). Note that in the ROUTERED case a
 * non-NULL entry can be returned with *err still set to
 * GLD_NORESOURCES.
 */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
	ibd_ace_t *ptr;

	/*
	 * Only attempt to print when we can; in the mdt pattr case, the
	 * address is not aligned properly.
	 */
	if (((ulong_t)mac & 3) == 0)
		DPRINT(4,
		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
		    htonl(mac->ipoib_gidsuff[1]));

	mutex_enter(&state->id_ac_mutex);

	/* Fast path: the destination is already cached. */
	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}

	/*
	 * Implementation of a single outstanding async request; if
	 * the operation is not started yet, queue a request and move
	 * to ongoing state. Remember in id_ah_addr for which address
	 * we are queueing the request, in case we need to flag an error;
	 * Any further requests, for the same or different address, until
	 * the operation completes, is sent back to GLD to be retried.
	 * The async thread will update id_ah_op with an error indication
	 * or will set it to indicate the next look up can start; either
	 * way, it will gld_sched() so that all blocked requests come
	 * back here.
	 */
	*err = GLD_NORESOURCES;
	if (state->id_ah_op == NOTSTARTED) {
		/*
		 * We did not even find the entry; queue a request for it.
		 * The request is queued while id_ac_mutex is still held.
		 */
		bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL);
		ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH);
		state->id_ah_op = ONGOING;
		bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
	} else if ((state->id_ah_op != ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
		/*
		 * Check the status of the pathrecord lookup request
		 * we had queued before.
		 */
		if (state->id_ah_op == ERRORED) {
			*err = GLD_FAILURE;
			state->id_ah_error++;
		} else {
			/*
			 * ROUTERED case: We need to send to the
			 * all-router MCG. If we can find the AH for
			 * the mcg, the Tx will be attempted. If we
			 * do not find the AH, we return NORESOURCES
			 * to retry.
			 */
			ipoib_mac_t routermac;

			(void) ibd_get_allroutergroup(state, mac, &routermac);
			ptr = ibd_acache_find(state, &routermac, B_TRUE,
			    numwqe);
		}
		/* Result consumed; allow the next lookup to be queued. */
		state->id_ah_op = NOTSTARTED;
	}
	mutex_exit(&state->id_ac_mutex);

	/*
	 * The PathRecord lookup failed; retry any other blocked
	 * Tx requests that might have come in between when we
	 * initiated the path lookup and now that were sent back
	 * to GLD to implement single outstanding lookup scheme.
	 */
	if (*err == GLD_FAILURE)
		gld_sched(state->id_macinfo);
	return (ptr);
}
1733 
1734 /*
1735  * Grab a not-currently-in-use AH/PathRecord from the active
1736  * list to recycle to a new destination. Only the async thread
1737  * executes this code.
1738  */
1739 static ibd_ace_t *
1740 ibd_acache_get_unref(ibd_state_t *state)
1741 {
1742 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1743 
1744 	ASSERT(mutex_owned(&state->id_ac_mutex));
1745 
1746 	/*
1747 	 * Do plain linear search.
1748 	 */
1749 	while (ptr != NULL) {
1750 		/*
1751 		 * Note that it is possible that the "cycle" bit
1752 		 * is set on the AH w/o any reference count. The
1753 		 * mcg must have been deleted, and the tx cleanup
1754 		 * just decremented the reference count to 0, but
1755 		 * hasn't gotten around to grabbing the id_ac_mutex
1756 		 * to move the AH into the free list.
1757 		 */
1758 		if (GET_REF(ptr) == 0) {
1759 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1760 			break;
1761 		}
1762 		ptr = list_next(&state->id_ah_active, ptr);
1763 	}
1764 	return (ptr);
1765 }
1766 
1767 /*
1768  * Invoked to clean up AH from active list in case of multicast
1769  * disable and to handle sendonly memberships during mcg traps.
1770  * And for port up processing for multicast and unicast AHs.
1771  * Normally, the AH is taken off the active list, and put into
1772  * the free list to be recycled for a new destination. In case
1773  * Tx requests on the AH have not completed yet, the AH is marked
1774  * for reaping (which will put the AH on the free list) once the Tx's
1775  * complete; in this case, depending on the "force" input, we take
1776  * out the AH from the active list right now, or leave it also for
1777  * the reap operation. Returns TRUE if the AH is taken off the active
1778  * list (and either put into the free list right now, or arranged for
1779  * later), FALSE otherwise.
1780  */
1781 static boolean_t
1782 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1783 {
1784 	ibd_ace_t *acactive;
1785 	boolean_t ret = B_TRUE;
1786 
1787 	ASSERT(mutex_owned(&state->id_ac_mutex));
1788 
1789 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1790 
1791 		/*
1792 		 * Note that the AH might already have the cycle bit set
1793 		 * on it; this might happen if sequences of multicast
1794 		 * enables and disables are coming so fast, that posted
1795 		 * Tx's to the mcg have not completed yet, and the cycle
1796 		 * bit is set successively by each multicast disable.
1797 		 */
1798 		if (SET_CYCLE_IF_REF(acactive)) {
1799 			if (!force) {
1800 				/*
1801 				 * The ace is kept on the active list, further
1802 				 * Tx's can still grab a reference on it; the
1803 				 * ace is reaped when all pending Tx's
1804 				 * referencing the AH complete.
1805 				 */
1806 				ret = B_FALSE;
1807 			} else {
1808 				/*
1809 				 * In the mcg trap case, we always pull the
1810 				 * AH from the active list. And also the port
1811 				 * up multi/unicast case.
1812 				 */
1813 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1814 				acactive->ac_mce = NULL;
1815 			}
1816 		} else {
1817 			/*
1818 			 * Determined the ref count is 0, thus reclaim
1819 			 * immediately after pulling out the ace from
1820 			 * the active list.
1821 			 */
1822 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1823 			acactive->ac_mce = NULL;
1824 			IBD_ACACHE_INSERT_FREE(state, acactive);
1825 		}
1826 
1827 	}
1828 	return (ret);
1829 }
1830 
1831 /*
1832  * Helper function for async path record lookup. If we are trying to
1833  * Tx to a MCG, check our membership, possibly trying to join the
1834  * group if required. If that fails, try to send the packet to the
1835  * all router group (indicated by the redirect output), pointing
1836  * the input mac address to the router mcg address.
1837  */
1838 static ibd_mce_t *
1839 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1840 {
1841 	ib_gid_t mgid;
1842 	ibd_mce_t *mce;
1843 	ipoib_mac_t routermac;
1844 
1845 	*redirect = B_FALSE;
1846 	ibd_n2h_gid(mac, &mgid);
1847 
1848 	/*
1849 	 * Check the FullMember+SendOnlyNonMember list.
1850 	 * Since we are the only one who manipulates the
1851 	 * id_mc_full list, no locks are needed.
1852 	 */
1853 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1854 	if (mce != NULL) {
1855 		DPRINT(4, "ibd_async_mcache : already joined to group");
1856 		return (mce);
1857 	}
1858 
1859 	/*
1860 	 * Not found; try to join(SendOnlyNonMember) and attach.
1861 	 */
1862 	DPRINT(4, "ibd_async_mcache : not joined to group");
1863 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1864 	    NULL) {
1865 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1866 		return (mce);
1867 	}
1868 
1869 	/*
1870 	 * MCGroup not present; try to join the all-router group. If
1871 	 * any of the following steps succeed, we will be redirecting
1872 	 * to the all router group.
1873 	 */
1874 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1875 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1876 		return (NULL);
1877 	*redirect = B_TRUE;
1878 	ibd_n2h_gid(&routermac, &mgid);
1879 	bcopy(&routermac, mac, IPOIB_ADDRL);
1880 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1881 	    mgid.gid_prefix, mgid.gid_guid);
1882 
1883 	/*
1884 	 * Are we already joined to the router group?
1885 	 */
1886 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1887 		DPRINT(4, "ibd_async_mcache : using already joined router"
1888 		    "group\n");
1889 		return (mce);
1890 	}
1891 
1892 	/*
1893 	 * Can we join(SendOnlyNonMember) the router group?
1894 	 */
1895 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1896 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1897 	    NULL) {
1898 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1899 		return (mce);
1900 	}
1901 
1902 	return (NULL);
1903 }
1904 
1905 /*
1906  * Async path record lookup code.
1907  */
1908 static void
1909 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1910 {
1911 	ibd_ace_t *ce;
1912 	ibd_mce_t *mce = NULL;
1913 	ibt_path_attr_t path_attr;
1914 	ibt_path_info_t path_info;
1915 	ib_gid_t destgid;
1916 	int ret = NOTSTARTED;
1917 
1918 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1919 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1920 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1921 	    htonl(mac->ipoib_gidsuff[1]));
1922 
1923 	/*
1924 	 * Check whether we are trying to transmit to a MCG.
1925 	 * In that case, we need to make sure we are a member of
1926 	 * the MCG.
1927 	 */
1928 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1929 		boolean_t redirected;
1930 
1931 		/*
1932 		 * If we can not find or join the group or even
1933 		 * redirect, error out.
1934 		 */
1935 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1936 		    NULL) {
1937 			state->id_ah_op = ERRORED;
1938 			return;
1939 		}
1940 
1941 		/*
1942 		 * If we got redirected, we need to determine whether
1943 		 * the AH for the new mcg is in the cache already, and
1944 		 * not pull it in then; otherwise proceed to get the
1945 		 * path for the new mcg. There is no guarantee that
1946 		 * if the AH is currently in the cache, it will still be
1947 		 * there when we look in ibd_acache_lookup(), but that's
1948 		 * okay, we will come back here.
1949 		 */
1950 		if (redirected) {
1951 			ret = ROUTERED;
1952 			DPRINT(4, "ibd_async_acache :  redirected to "
1953 			    "%08X:%08X:%08X:%08X:%08X",
1954 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1955 			    htonl(mac->ipoib_gidpref[1]),
1956 			    htonl(mac->ipoib_gidsuff[0]),
1957 			    htonl(mac->ipoib_gidsuff[1]));
1958 
1959 			mutex_enter(&state->id_ac_mutex);
1960 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1961 				mutex_exit(&state->id_ac_mutex);
1962 				DPRINT(4, "ibd_async_acache : router AH found");
1963 				state->id_ah_op = ROUTERED;
1964 				return;
1965 			}
1966 			mutex_exit(&state->id_ac_mutex);
1967 		}
1968 	}
1969 
1970 	/*
1971 	 * Get an AH from the free list.
1972 	 */
1973 	mutex_enter(&state->id_ac_mutex);
1974 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1975 		/*
1976 		 * No free ones; try to grab an unreferenced active
1977 		 * one. Maybe we need to make the active list LRU,
1978 		 * but that will create more work for Tx callbacks.
1979 		 * Is there a way of not having to pull out the
1980 		 * entry from the active list, but just indicate it
1981 		 * is being recycled? Yes, but that creates one more
1982 		 * check in the fast lookup path.
1983 		 */
1984 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1985 			/*
1986 			 * Pretty serious shortage now.
1987 			 */
1988 			state->id_ah_op = NOTSTARTED;
1989 			mutex_exit(&state->id_ac_mutex);
1990 			DPRINT(10, "ibd_async_acache : failed to find AH "
1991 			    "slot\n");
1992 			return;
1993 		}
1994 		/*
1995 		 * We could check whether ac_mce points to a SendOnly
1996 		 * member and drop that membership now. Or do it lazily
1997 		 * at detach time.
1998 		 */
1999 		ce->ac_mce = NULL;
2000 	}
2001 	mutex_exit(&state->id_ac_mutex);
2002 	ASSERT(ce->ac_mce == NULL);
2003 
2004 	/*
2005 	 * Update the entry.
2006 	 */
2007 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
2008 
2009 	bzero(&path_info, sizeof (path_info));
2010 	bzero(&path_attr, sizeof (ibt_path_attr_t));
2011 	path_attr.pa_sgid = state->id_sgid;
2012 	path_attr.pa_num_dgids = 1;
2013 	ibd_n2h_gid(&ce->ac_mac, &destgid);
2014 	path_attr.pa_dgids = &destgid;
2015 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2016 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2017 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
2018 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
2019 		goto error;
2020 	}
2021 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
2022 	    ntohl(ce->ac_mac.ipoib_qpn),
2023 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
2024 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
2025 		goto error;
2026 	}
2027 
2028 	/*
2029 	 * mce is set whenever an AH is being associated with a
2030 	 * MCG; this will come in handy when we leave the MCG. The
2031 	 * lock protects Tx fastpath from scanning the active list.
2032 	 */
2033 	if (mce != NULL)
2034 		ce->ac_mce = mce;
2035 	mutex_enter(&state->id_ac_mutex);
2036 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
2037 	state->id_ah_op = ret;
2038 	mutex_exit(&state->id_ac_mutex);
2039 	return;
2040 error:
2041 	/*
2042 	 * We might want to drop SendOnly membership here if we
2043 	 * joined above. The lock protects Tx callbacks inserting
2044 	 * into the free list.
2045 	 */
2046 	mutex_enter(&state->id_ac_mutex);
2047 	state->id_ah_op = ERRORED;
2048 	IBD_ACACHE_INSERT_FREE(state, ce);
2049 	mutex_exit(&state->id_ac_mutex);
2050 }
2051 
2052 /*
2053  * While restoring port's presence on the subnet on a port up, it is possible
2054  * that the port goes down again.
2055  */
2056 static void
2057 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
2058 {
2059 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
2060 	int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN :
2061 	    GLD_LINKSTATE_UP;
2062 	ibd_mce_t *mce, *pmce;
2063 	ibd_ace_t *ace, *pace;
2064 
2065 	DPRINT(10, "ibd_async_link(): %d", opcode);
2066 
2067 	/*
2068 	 * On a link up, revalidate the link speed/width. No point doing
2069 	 * this on a link down, since we will be unable to do SA operations,
2070 	 * defaulting to the lowest speed. Also notice that we update our
2071 	 * notion of speed before calling gld_linkstate(), which will do
2072 	 * neccesary higher level notifications for speed changes.
2073 	 */
2074 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
2075 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2076 		state->id_link_speed = ibd_get_portspeed(state);
2077 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2078 	}
2079 
2080 	/*
2081 	 * Do all the work required to establish our presence on
2082 	 * the subnet.
2083 	 */
2084 	if (opcode == IBD_LINK_UP_ABSENT) {
2085 		/*
2086 		 * If in promiscuous mode ...
2087 		 */
2088 		if (state->id_prom_op == COMPLETED) {
2089 			/*
2090 			 * Drop all nonmembership.
2091 			 */
2092 			ibd_async_unsetprom(state, B_FALSE);
2093 
2094 			/*
2095 			 * Then, try to regain nonmembership to all mcg's.
2096 			 */
2097 			ibd_async_setprom(state, B_FALSE);
2098 
2099 		}
2100 
2101 		/*
2102 		 * Drop all sendonly membership (which also gets rid of the
2103 		 * AHs); try to reacquire all full membership.
2104 		 */
2105 		mce = list_head(&state->id_mc_full);
2106 		while ((pmce = mce) != NULL) {
2107 			mce = list_next(&state->id_mc_full, mce);
2108 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
2109 				ibd_leave_group(state,
2110 				    pmce->mc_info.mc_adds_vect.av_dgid,
2111 				    IB_MC_JSTATE_SEND_ONLY_NON);
2112 			else
2113 				ibd_reacquire_group(state, pmce);
2114 		}
2115 
2116 		/*
2117 		 * Recycle all active AHs to free list (and if there are
2118 		 * pending posts, make sure they will go into the free list
2119 		 * once the Tx's complete). Grab the lock to prevent
2120 		 * concurrent Tx's as well as Tx cleanups.
2121 		 */
2122 		mutex_enter(&state->id_ac_mutex);
2123 		ace = list_head(&state->id_ah_active);
2124 		while ((pace = ace) != NULL) {
2125 			boolean_t cycled;
2126 
2127 			ace = list_next(&state->id_ah_active, ace);
2128 			mce = pace->ac_mce;
2129 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
2130 			    B_TRUE);
2131 			/*
2132 			 * If this is for an mcg, it must be for a fullmember,
2133 			 * since we got rid of send-only members above when
2134 			 * processing the mce list.
2135 			 */
2136 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2137 			    IB_MC_JSTATE_FULL)));
2138 
2139 			/*
2140 			 * Check if the fullmember mce needs to be torn down,
2141 			 * ie whether the DLPI disable has already been done.
2142 			 * If so, do some of the work of tx_cleanup, namely
2143 			 * causing leave (which will fail), detach and
2144 			 * mce-freeing. tx_cleanup will put the AH into free
2145 			 * list. The reason to duplicate some of this
2146 			 * tx_cleanup work is because we want to delete the
2147 			 * AH right now instead of waiting for tx_cleanup, to
2148 			 * force subsequent Tx's to reacquire an AH.
2149 			 */
2150 			if ((mce != NULL) && (mce->mc_fullreap))
2151 				ibd_async_reap_group(state, mce,
2152 				    mce->mc_info.mc_adds_vect.av_dgid,
2153 				    mce->mc_jstate);
2154 		}
2155 		mutex_exit(&state->id_ac_mutex);
2156 	}
2157 
2158 	/*
2159 	 * Macinfo is guaranteed to exist since driver does ibt_close_hca()
2160 	 * (which stops further events from being delivered) before
2161 	 * gld_mac_free(). At this point, it is guaranteed that gld_register
2162 	 * has already been done.
2163 	 */
2164 	mutex_enter(&state->id_link_mutex);
2165 	state->id_link_state = lstate;
2166 	gld_linkstate(state->id_macinfo, lstate);
2167 	mutex_exit(&state->id_link_mutex);
2168 
2169 	/*
2170 	 * Free the request slot allocated by the event thread.
2171 	 */
2172 	kmem_free(req, sizeof (ibd_req_t));
2173 
2174 	ibd_async_done(state);
2175 }
2176 
2177 /*
2178  * When the link is notified up, we need to do a few things, based
2179  * on the port's current p_init_type_reply claiming a reinit has been
2180  * done or not. The reinit steps are:
2181  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2182  *    the old Pkey and GID0 are correct.
2183  * 2. Register for mcg traps (already done by ibmf).
2184  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2185  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2186  * 4. Give up all sendonly memberships.
2187  * 5. Acquire all full memberships.
2188  * 6. In promiscuous mode, acquire all non memberships.
2189  * 7. Recycle all AHs to free list.
2190  */
2191 static void
2192 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2193 {
2194 	ibt_hca_portinfo_t *port_infop;
2195 	ibt_status_t ibt_status;
2196 	uint_t psize, port_infosz;
2197 	ibd_link_op_t opcode;
2198 	ibd_req_t *req;
2199 
2200 	/*
2201 	 * Do not send a request to the async daemon if it has not
2202 	 * yet been created or is being destroyed. If the async
2203 	 * daemon has not yet been created, we still need to track
2204 	 * last known state of the link. If this code races with the
2205 	 * detach path, then we are assured that the detach path has
2206 	 * not yet done the ibt_close_hca (which waits for all async
2207 	 * events to complete). If the code races with the attach path,
2208 	 * we need to validate the pkey/gid (in the link_up case) if
2209 	 * the initialization path has already set these up and created
2210 	 * IBTF resources based on the values.
2211 	 */
2212 	mutex_enter(&state->id_link_mutex);
2213 
2214 	/*
2215 	 * If the init code in ibd_drv_init hasn't yet set up the
2216 	 * pkey/gid, nothing to do; that code will set the link state.
2217 	 */
2218 	if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) {
2219 		mutex_exit(&state->id_link_mutex);
2220 		return;
2221 	}
2222 
2223 	if (code == IBT_EVENT_PORT_UP) {
2224 		uint8_t itreply;
2225 		boolean_t badup = B_FALSE;
2226 
2227 		ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
2228 		    state->id_port, &port_infop, &psize, &port_infosz);
2229 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
2230 			mutex_exit(&state->id_link_mutex);
2231 			DPRINT(10, "ibd_link_up : failed in"
2232 			    " ibt_query_port()\n");
2233 			return;
2234 		}
2235 
2236 		/*
2237 		 * If the link already went down by the time the handler gets
2238 		 * here, give up; we can not even validate pkey/gid since those
2239 		 * are not valid.
2240 		 */
2241 		if (port_infop->p_linkstate != IBT_PORT_ACTIVE)
2242 			badup = B_TRUE;
2243 
2244 		itreply = port_infop->p_init_type_reply;
2245 
2246 		/*
2247 		 * In InitTypeReply, check if NoLoadReply ==
2248 		 * PreserveContentReply == 0, in which case, verify Pkey/GID0.
2249 		 */
2250 		if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2251 		    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
2252 		    (!badup)) {
2253 			/*
2254 			 * Check that the subnet part of GID0 has not changed.
2255 			 */
2256 			if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid,
2257 			    sizeof (ib_gid_t)) != 0)
2258 				badup = B_TRUE;
2259 
2260 			/*
2261 			 * Check that Pkey/index mapping is still valid.
2262 			 */
2263 			if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) ||
2264 			    (port_infop->p_pkey_tbl[state->id_pkix] !=
2265 			    state->id_pkey))
2266 				badup = B_TRUE;
2267 		}
2268 
2269 		/*
2270 		 * In InitTypeReply, if PreservePresenceReply indicates the SM
2271 		 * has ensured that the port's presence in mcg, traps etc is
2272 		 * intact, nothing more to do.
2273 		 */
2274 		opcode = IBD_LINK_UP_ABSENT;
2275 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2276 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
2277 			opcode = IBD_LINK_UP;
2278 
2279 		if (badup)
2280 			code = IBT_ERROR_PORT_DOWN;
2281 		ibt_free_portinfo(port_infop, port_infosz);
2282 	}
2283 
2284 	if (!ibd_async_safe(state)) {
2285 		state->id_link_state = ((code == IBT_EVENT_PORT_UP) ?
2286 		    GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN);
2287 		mutex_exit(&state->id_link_mutex);
2288 		return;
2289 	}
2290 	mutex_exit(&state->id_link_mutex);
2291 
2292 	if (code == IBT_ERROR_PORT_DOWN)
2293 		opcode = IBD_LINK_DOWN;
2294 
2295 	req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP);
2296 	req->rq_ptr = (void *)opcode;
2297 	ibd_queue_work_slot(state, req, ASYNC_LINK);
2298 }
2299 
2300 /*
2301  * For the port up/down events, IBTL guarantees there will not be concurrent
2302  * invocations of the handler. IBTL might coalesce link transition events,
2303  * and not invoke the handler for _each_ up/down transition, but it will
2304  * invoke the handler with last known state
2305  */
2306 static void
2307 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2308     ibt_async_code_t code, ibt_async_event_t *event)
2309 {
2310 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2311 
2312 	switch (code) {
2313 	case IBT_ERROR_CATASTROPHIC_CHAN:
2314 		ibd_print_warn(state, "catastrophic channel error");
2315 		break;
2316 	case IBT_ERROR_CQ:
2317 		ibd_print_warn(state, "completion queue error");
2318 		break;
2319 	case IBT_ERROR_PORT_DOWN:
2320 	case IBT_EVENT_PORT_UP:
2321 		/*
2322 		 * Events will be delivered to all instances that have
2323 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2324 		 * Only need to do work for our port; IBTF will deliver
2325 		 * events for other ports on the hca we have ibt_open_hca'ed
2326 		 * too. Note that ibd_drv_init() initializes id_port before
2327 		 * doing ibt_open_hca().
2328 		 */
2329 		ASSERT(state->id_hca_hdl == hca_hdl);
2330 		if (state->id_port != event->ev_port)
2331 			break;
2332 
2333 		ibd_link_mod(state, code);
2334 		break;
2335 
2336 	case IBT_HCA_ATTACH_EVENT:
2337 	case IBT_HCA_DETACH_EVENT:
2338 		/*
2339 		 * When a new card is plugged to the system, attach_event is
2340 		 * invoked. Additionally, a cfgadm needs to be run to make the
2341 		 * card known to the system, and an ifconfig needs to be run to
2342 		 * plumb up any ibd interfaces on the card. In the case of card
2343 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2344 		 * unplumb the ibd interfaces on the card; when the card is
2345 		 * actually unplugged, the detach_event is invoked;
2346 		 * additionally, if any ibd instances are still active on the
2347 		 * card (eg there were no associated RCM scripts), driver's
2348 		 * detach routine is invoked.
2349 		 */
2350 		break;
2351 	default:
2352 		break;
2353 	}
2354 }
2355 
2356 /*
2357  * Attach device to the IO framework.
2358  */
2359 static int
2360 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2361 {
2362 	ibd_state_t *state;
2363 	int instance;
2364 
2365 	switch (cmd) {
2366 		case DDI_ATTACH:
2367 			break;
2368 		case DDI_RESUME:
2369 			/* This driver does not support resume */
2370 		default:
2371 			return (DDI_FAILURE);
2372 	}
2373 
2374 	/*
2375 	 * Allocate soft device data structure
2376 	 */
2377 	instance = ddi_get_instance(dip);
2378 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2379 		return (DDI_FAILURE);
2380 	state = ddi_get_soft_state(ibd_list, instance);
2381 
2382 	/* pre ibt_attach() soft state initialization */
2383 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2384 		DPRINT(10, "ibd_attach : failed in ibd_state_init()");
2385 		goto attach_fail_state_init;
2386 	}
2387 
2388 	/* "attach" to IBTL */
2389 	if (ibt_attach(&ibd_clnt_modinfo, dip, state,
2390 	    &state->id_ibt_hdl) != IBT_SUCCESS) {
2391 		DPRINT(10, "ibd_attach : failed in ibt_attach()");
2392 		goto attach_fail_ibt_attach;
2393 	}
2394 
2395 	/* Finish initializing this driver */
2396 	if (ibd_drv_init(state) != DDI_SUCCESS) {
2397 		DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n");
2398 		goto attach_fail_drv_init;
2399 	}
2400 
2401 	/*
2402 	 *  Register ourselves with the GLD interface
2403 	 *
2404 	 *  gld_register will:
2405 	 *	link us with the GLD module;
2406 	 *	set our ddi_set_driver_private(9F) data to the macinfo ptr;
2407 	 *	save the devinfo pointer in macinfo->gldm_devinfo;
2408 	 *	create the minor device node.
2409 	 */
2410 	if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) {
2411 		DPRINT(10, "ibd_attach : failed in gld_register()");
2412 		goto attach_fail_gld_register;
2413 	}
2414 
2415 	/*
2416 	 * Setup the handler we will use for regular DLPI stuff. Its important
2417 	 * to setup the recv handler after registering with gld. Setting it
2418 	 * before causes at times an incoming packet to be forwarded to gld
2419 	 * before the gld_register. This will result in gld dropping the packet
2420 	 * which is ignored by ibd_rcq_handler, thus failing to re-arm the
2421 	 * tavor events. This will cause tavor_isr on recv path to be not
2422 	 * invoked any further.
2423 	 */
2424 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2425 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
2426 	    IBT_SUCCESS) {
2427 		DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n");
2428 		goto attach_fail_gld_register;
2429 	}
2430 
2431 	/*
2432 	 * Setup the subnet notices handler after we initialize the a/mcaches
2433 	 * and start the async thread, both of which are required for the
2434 	 * trap handler to function properly. Enable the trap handler to
2435 	 * queue requests to the async thread after the gld_register, because
2436 	 * the async daemon invokes gld_sched(), which must be done after
2437 	 * gld_register().
2438 	 */
2439 	ibt_register_subnet_notices(state->id_ibt_hdl,
2440 	    ibd_snet_notices_handler, state);
2441 	mutex_enter(&state->id_trap_lock);
2442 	state->id_trap_stop = B_FALSE;
2443 	mutex_exit(&state->id_trap_lock);
2444 
2445 	/*
2446 	 * Indicate link status to GLD and higher layers. By default,
2447 	 * we assume we are in up state (which must have been true at
2448 	 * least at the time the broadcast mcg's were probed); if there
2449 	 * were any up/down transitions till the time we come here, the
2450 	 * async handler will have updated last known state, which we
2451 	 * use to tell GLD. The async handler will not send any
2452 	 * notifications to GLD till we reach here in the initialization
2453 	 * sequence.
2454 	 */
2455 	mutex_enter(&state->id_link_mutex);
2456 	gld_linkstate(state->id_macinfo, state->id_link_state);
2457 	mutex_exit(&state->id_link_mutex);
2458 
2459 	return (DDI_SUCCESS);
2460 
2461 	/* Attach failure points, cleanup */
2462 attach_fail_gld_register:
2463 	ibd_drv_fini(state);
2464 
2465 attach_fail_drv_init:
2466 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2467 		ibd_print_warn(state, "failed to free IB resources");
2468 
2469 attach_fail_ibt_attach:
2470 	ibd_state_fini(state);
2471 
2472 attach_fail_state_init:
2473 	ddi_soft_state_free(ibd_list, instance);
2474 
2475 	return (DDI_FAILURE);
2476 }
2477 
2478 /*
2479  * Detach device from the IO framework.
2480  */
2481 static int
2482 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2483 {
2484 	ibd_state_t *state;
2485 	int status;
2486 	int instance;
2487 
2488 	switch (cmd) {
2489 		case DDI_DETACH:
2490 			break;
2491 		case DDI_SUSPEND:
2492 		default:
2493 			return (DDI_FAILURE);
2494 	}
2495 
2496 	instance = ddi_get_instance(dip);
2497 	state = ddi_get_soft_state(ibd_list, instance);
2498 
2499 	/*
2500 	 * First, stop receive interrupts; this stops the
2501 	 * driver from handing up buffers to higher layers.
2502 	 * Wait for receive buffers to be returned; give up
2503 	 * after 5 seconds.
2504 	 */
2505 	ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
2506 	status = 50;
2507 	while (state->id_rx_list.dl_bufs_outstanding > 0) {
2508 		delay(drv_usectohz(100000));
2509 		if (--status == 0) {
2510 			DPRINT(2, "ibd_detach : reclaiming failed");
2511 			goto failed;
2512 		}
2513 	}
2514 
2515 	if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) {
2516 		DPRINT(10, "ibd_detach : failed in gld_unregister()");
2517 		goto failed;
2518 	}
2519 
2520 	ibd_drv_fini(state);
2521 
2522 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2523 		ibd_print_warn(state, "failed to free all IB resources at "
2524 		    "driver detach time");
2525 
2526 	ibd_state_fini(state);
2527 	ddi_soft_state_free(ibd_list, instance);
2528 	return (DDI_SUCCESS);
2529 
2530 failed:
2531 	/*
2532 	 * Reap all the Tx/Rx completions that were posted since we
2533 	 * turned off the notification. Turn on notifications. There
2534 	 * is a race in that we do not reap completions that come in
2535 	 * after the poll and before notifications get turned on. That
2536 	 * is okay, the next rx/tx packet will trigger a completion
2537 	 * that will reap any missed completions.
2538 	 */
2539 	ibd_poll_compq(state, state->id_rcq_hdl);
2540 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2541 	return (DDI_FAILURE);
2542 }
2543 
2544 /*
2545  * Pre ibt_attach() driver initialization
2546  */
2547 static int
2548 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2549 {
2550 	gld_mac_info_t *macinfo;
2551 
2552 	if ((macinfo = gld_mac_alloc(dip)) == NULL) {
2553 		DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()");
2554 		return (DDI_FAILURE);
2555 	}
2556 
2557 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2558 	state->id_link_state = GLD_LINKSTATE_UNKNOWN;
2559 
2560 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2561 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2562 	state->id_trap_stop = B_TRUE;
2563 	state->id_trap_inprog = 0;
2564 
2565 	mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL);
2566 	state->id_dip = dip;
2567 	state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP);
2568 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP);
2569 
2570 	state->id_sched_queued = B_FALSE;
2571 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2572 
2573 	state->id_tx_list.dl_head = NULL;
2574 	state->id_tx_list.dl_tail = NULL;
2575 	state->id_tx_list.dl_pending_sends = B_FALSE;
2576 	state->id_tx_list.dl_cnt = 0;
2577 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2578 
2579 	state->id_rx_list.dl_head = NULL;
2580 	state->id_rx_list.dl_tail = NULL;
2581 	state->id_rx_list.dl_bufs_outstanding = 0;
2582 	state->id_rx_list.dl_cnt = 0;
2583 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2584 
2585 	/* Link up various structs for later access */
2586 	macinfo->gldm_private = (caddr_t)state;
2587 	state->id_macinfo = macinfo;
2588 
2589 	/*
2590 	 * Initialize pointers to device specific functions which will be
2591 	 * used by the generic layer.
2592 	 */
2593 	macinfo->gldm_reset = ibd_reset;
2594 	macinfo->gldm_start = ibd_start;
2595 	macinfo->gldm_stop = ibd_stop;
2596 	macinfo->gldm_set_mac_addr = ibd_set_mac_addr;
2597 	macinfo->gldm_set_multicast = ibd_set_multicast;
2598 	macinfo->gldm_set_promiscuous = ibd_set_promiscuous;
2599 	macinfo->gldm_get_stats = ibd_get_stats;
2600 	macinfo->gldm_send = ibd_send;
2601 	macinfo->gldm_intr = ibd_intr;
2602 	macinfo->gldm_mdt_pre = ibd_mdt_pre;
2603 	macinfo->gldm_mdt_send = ibd_mdt_txone;
2604 	macinfo->gldm_mdt_post = ibd_mdt_post;
2605 	macinfo->gldm_mdt_sgl = state->id_max_sqseg;
2606 	macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS;
2607 
2608 	/* Initialize board characteristics needed by the generic layer. */
2609 	macinfo->gldm_ident = "InfiniBand DLPI Driver";
2610 	macinfo->gldm_type = DL_IB;
2611 	macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */
2612 	macinfo->gldm_addrlen = IPOIB_ADDRL;
2613 	macinfo->gldm_saplen = -2;
2614 	macinfo->gldm_capabilities = GLD_CAP_LINKSTATE;
2615 
2616 	/* Other required initialization */
2617 	macinfo->gldm_ppa = ddi_get_instance(dip);
2618 	macinfo->gldm_devinfo = dip;
2619 
2620 	return (DDI_SUCCESS);
2621 }
2622 
2623 /*
2624  * Post ibt_detach() driver deconstruction
2625  */
2626 static void
2627 ibd_state_fini(ibd_state_t *state)
2628 {
2629 	mutex_destroy(&state->id_tx_list.dl_mutex);
2630 	mutex_destroy(&state->id_rx_list.dl_mutex);
2631 	mutex_destroy(&state->id_sched_lock);
2632 	mutex_destroy(&state->id_txcomp_lock);
2633 	kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE);
2634 	kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE);
2635 	cv_destroy(&state->id_trap_cv);
2636 	mutex_destroy(&state->id_trap_lock);
2637 	mutex_destroy(&state->id_link_mutex);
2638 	gld_mac_free(state->id_macinfo);
2639 }
2640 
2641 /*
2642  * Fetch IBA parameters for the network device from IB nexus.
2643  */
2644 static int
2645 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid)
2646 {
2647 	/*
2648 	 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
2649 	 * Note that the default partition is also allowed.
2650 	 */
2651 	state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2652 	    0, "port-pkey", IB_PKEY_INVALID_LIMITED);
2653 	if (state->id_pkey <= IB_PKEY_INVALID_FULL) {
2654 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong"
2655 		    "partition\n");
2656 		return (DDI_FAILURE);
2657 	}
2658 
2659 	/*
2660 	 * ... the IBA port ...
2661 	 */
2662 	state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2663 	    0, "port-number", 0);
2664 	if (state->id_port == 0) {
2665 		DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n");
2666 		return (DDI_FAILURE);
2667 	}
2668 
2669 	/*
2670 	 * ... and HCA GUID.
2671 	 */
2672 	*hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
2673 	    0, "hca-guid", 0);
2674 	if (*hca_guid == 0) {
2675 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong "
2676 		    "guid\n");
2677 		return (DDI_FAILURE);
2678 	}
2679 
2680 	return (DDI_SUCCESS);
2681 }
2682 
2683 /*
2684  * Fetch link speed from SA for snmp ifspeed reporting.
2685  */
2686 static uint64_t
2687 ibd_get_portspeed(ibd_state_t *state)
2688 {
2689 	int			ret;
2690 	uint64_t		ifspeed;
2691 	size_t			length;
2692 	ib_lid_t		lid;
2693 	sa_portinfo_record_t	req, *resp = NULL;
2694 	ibmf_saa_access_args_t	args;
2695 	ibmf_saa_handle_t	saa_handle;
2696 
2697 	/*
2698 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2699 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2700 	 * 2000000000. Start with that as default.
2701 	 */
2702 	ifspeed = 2000000000;
2703 
2704 	/* Get port lid */
2705 	if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL,
2706 	    &lid) != IBT_SUCCESS)
2707 		goto earlydone;
2708 
2709 	if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL,
2710 	    IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS)
2711 		goto earlydone;
2712 
2713 	/* Contact SA Access */
2714 	bzero(&req, sizeof (sa_portinfo_record_t));
2715 	req.EndportLID = lid;
2716 
2717 	args.sq_attr_id		= SA_PORTINFORECORD_ATTRID;
2718 	args.sq_access_type	= IBMF_SAA_RETRIEVE;
2719 	args.sq_component_mask	= SA_PORTINFO_COMPMASK_PORTLID;
2720 	args.sq_template	= &req;
2721 	args.sq_callback	= NULL;
2722 	args.sq_callback_arg	= NULL;
2723 
2724 	ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp);
2725 	if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL))
2726 		goto done;
2727 
2728 	/*
2729 	 * 4X/12X needs appropriate multipliers. With IBA 1.2 additions,
2730 	 * double and quad multipliers are also needed per LinkSpeedEnabled.
2731 	 * In case SA does not return an expected value, report the default
2732 	 * speed as 1X.
2733 	 */
2734 	ret = 1;
2735 	switch (resp->PortInfo.LinkWidthActive) {
2736 		case SM_LINK_WIDTH_ACTIVE_1X:
2737 			ret = 1;
2738 			break;
2739 		case SM_LINK_WIDTH_ACTIVE_4X:
2740 			ret = 4;
2741 			break;
2742 		case SM_LINK_WIDTH_ACTIVE_12X:
2743 			ret = 12;
2744 			break;
2745 	}
2746 	ifspeed *= ret;
2747 	kmem_free(resp, length);
2748 
2749 done:
2750 	(void) ibmf_sa_session_close(&saa_handle, 0);
2751 
2752 earlydone:
2753 	return (ifspeed);
2754 }
2755 
2756 /*
2757  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2758  * representing the input mcg mgid.
2759  */
2760 static ibd_mce_t *
2761 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2762 {
2763 	ibd_mce_t *ptr = list_head(mlist);
2764 
2765 	/*
2766 	 * Do plain linear search.
2767 	 */
2768 	while (ptr != NULL) {
2769 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2770 		    sizeof (ib_gid_t)) == 0)
2771 			return (ptr);
2772 		ptr = list_next(mlist, ptr);
2773 	}
2774 	return (NULL);
2775 }
2776 
2777 /*
2778  * Execute IBA JOIN.
2779  */
2780 static ibt_status_t
2781 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2782 {
2783 	ibt_mcg_attr_t mcg_attr;
2784 
2785 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2786 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2787 	mcg_attr.mc_mgid = mgid;
2788 	mcg_attr.mc_join_state = mce->mc_jstate;
2789 	mcg_attr.mc_scope = state->id_scope;
2790 	mcg_attr.mc_pkey = state->id_pkey;
2791 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2792 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2793 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2794 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2795 	    NULL, NULL));
2796 }
2797 
2798 /*
2799  * This code JOINs the port in the proper way (depending on the join
2800  * state) so that IBA fabric will forward mcg packets to/from the port.
2801  * It also attaches the QPN to the mcg so it can receive those mcg
2802  * packets. This code makes sure not to attach the mcg to the QP if
2803  * that has been previously done due to the mcg being joined with a
2804  * different join state, even though this is not required by SWG_0216,
2805  * refid 3610.
2806  */
2807 static ibd_mce_t *
2808 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2809 {
2810 	ibt_status_t ibt_status;
2811 	ibd_mce_t *mce, *tmce, *omce = NULL;
2812 	boolean_t do_attach = B_TRUE;
2813 
2814 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2815 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2816 
2817 	/*
2818 	 * For enable_multicast Full member joins, we need to do some
2819 	 * extra work. If there is already an mce on the list that
2820 	 * indicates full membership, that means the membership has
2821 	 * not yet been dropped (since the disable_multicast was issued)
2822 	 * because there are pending Tx's to the mcg; in that case, just
2823 	 * mark the mce not to be reaped when the Tx completion queues
2824 	 * an async reap operation.
2825 	 *
2826 	 * If there is already an mce on the list indicating sendonly
2827 	 * membership, try to promote to full membership. Be careful
2828 	 * not to deallocate the old mce, since there might be an AH
2829 	 * pointing to it; instead, update the old mce with new data
2830 	 * that tracks the full membership.
2831 	 */
2832 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2833 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2834 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2835 			ASSERT(omce->mc_fullreap);
2836 			omce->mc_fullreap = B_FALSE;
2837 			return (omce);
2838 		} else {
2839 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2840 		}
2841 	}
2842 
2843 	/*
2844 	 * Allocate the ibd_mce_t to track this JOIN.
2845 	 */
2846 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2847 	mce->mc_fullreap = B_FALSE;
2848 	mce->mc_jstate = jstate;
2849 
2850 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2851 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2852 		    ibt_status);
2853 		kmem_free(mce, sizeof (ibd_mce_t));
2854 		return (NULL);
2855 	}
2856 
2857 	/*
2858 	 * Is an IBA attach required? Not if the interface is already joined
2859 	 * to the mcg in a different appropriate join state.
2860 	 */
2861 	if (jstate == IB_MC_JSTATE_NON) {
2862 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2863 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2864 			do_attach = B_FALSE;
2865 	} else if (jstate == IB_MC_JSTATE_FULL) {
2866 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2867 			do_attach = B_FALSE;
2868 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2869 		do_attach = B_FALSE;
2870 	}
2871 
2872 	if (do_attach) {
2873 		/*
2874 		 * Do the IBA attach.
2875 		 */
2876 		DPRINT(10, "ibd_join_group : ibt_attach_mcg \n");
2877 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2878 		    &mce->mc_info)) != IBT_SUCCESS) {
2879 			DPRINT(10, "ibd_join_group : failed qp attachment "
2880 			    "%d\n", ibt_status);
2881 			/*
2882 			 * NOTE that we should probably preserve the join info
2883 			 * in the list and later try to leave again at detach
2884 			 * time.
2885 			 */
2886 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2887 			    state->id_sgid, jstate);
2888 			kmem_free(mce, sizeof (ibd_mce_t));
2889 			return (NULL);
2890 		}
2891 	}
2892 
2893 	/*
2894 	 * Insert the ibd_mce_t in the proper list.
2895 	 */
2896 	if (jstate == IB_MC_JSTATE_NON) {
2897 		IBD_MCACHE_INSERT_NON(state, mce);
2898 	} else {
2899 		/*
2900 		 * Set up the mc_req fields used for reaping the
2901 		 * mcg in case of delayed tx completion (see
2902 		 * ibd_tx_cleanup()). Also done for sendonly join in
2903 		 * case we are promoted to fullmembership later and
2904 		 * keep using the same mce.
2905 		 */
2906 		mce->mc_req.rq_gid = mgid;
2907 		mce->mc_req.rq_ptr = mce;
2908 		/*
2909 		 * Check whether this is the case of trying to join
2910 		 * full member, and we were already joined send only.
2911 		 * We try to drop our SendOnly membership, but it is
2912 		 * possible that the mcg does not exist anymore (and
2913 		 * the subnet trap never reached us), so the leave
2914 		 * operation might fail.
2915 		 */
2916 		if (omce != NULL) {
2917 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2918 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2919 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2920 			bcopy(&mce->mc_info, &omce->mc_info,
2921 			    sizeof (ibt_mcg_info_t));
2922 			kmem_free(mce, sizeof (ibd_mce_t));
2923 			return (omce);
2924 		}
2925 		mutex_enter(&state->id_mc_mutex);
2926 		IBD_MCACHE_INSERT_FULL(state, mce);
2927 		mutex_exit(&state->id_mc_mutex);
2928 	}
2929 
2930 	return (mce);
2931 }
2932 
2933 /*
2934  * Called during port up event handling to attempt to reacquire full
2935  * membership to an mcg. Stripped down version of ibd_join_group().
2936  * Note that it is possible that the mcg might have gone away, and
2937  * gets recreated at this point.
2938  */
2939 static void
2940 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2941 {
2942 	ib_gid_t mgid;
2943 
2944 	/*
2945 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2946 	 * reap/leave is going to try to leave the group. We could prevent
2947 	 * that by adding a boolean flag into ibd_mce_t, if required.
2948 	 */
2949 	if (mce->mc_fullreap)
2950 		return;
2951 
2952 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2953 
2954 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2955 	    mgid.gid_guid);
2956 
2957 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2958 		ibd_print_warn(state, "Failure on port up to rejoin "
2959 		    "multicast gid %016llx:%016llx",
2960 		    (u_longlong_t)mgid.gid_prefix,
2961 		    (u_longlong_t)mgid.gid_guid);
2962 }
2963 
2964 /*
2965  * This code handles delayed Tx completion cleanups for mcg's to which
2966  * disable_multicast has been issued, regular mcg related cleanups during
2967  * disable_multicast, disable_promiscous and mcg traps, as well as
2968  * cleanups during driver detach time. Depending on the join state,
2969  * it deletes the mce from the appropriate list and issues the IBA
2970  * leave/detach; except in the disable_multicast case when the mce
2971  * is left on the active list for a subsequent Tx completion cleanup.
2972  */
2973 static void
2974 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2975     uint8_t jstate)
2976 {
2977 	ibd_mce_t *tmce;
2978 	boolean_t do_detach = B_TRUE;
2979 
2980 	/*
2981 	 * Before detaching, we must check whether the other list
2982 	 * contains the mcg; if we detach blindly, the consumer
2983 	 * who set up the other list will also stop receiving
2984 	 * traffic.
2985 	 */
2986 	if (jstate == IB_MC_JSTATE_FULL) {
2987 		/*
2988 		 * The following check is only relevant while coming
2989 		 * from the Tx completion path in the reap case.
2990 		 */
2991 		if (!mce->mc_fullreap)
2992 			return;
2993 		mutex_enter(&state->id_mc_mutex);
2994 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2995 		mutex_exit(&state->id_mc_mutex);
2996 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2997 			do_detach = B_FALSE;
2998 	} else if (jstate == IB_MC_JSTATE_NON) {
2999 		IBD_MCACHE_PULLOUT_NON(state, mce);
3000 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3001 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3002 			do_detach = B_FALSE;
3003 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3004 		mutex_enter(&state->id_mc_mutex);
3005 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3006 		mutex_exit(&state->id_mc_mutex);
3007 		do_detach = B_FALSE;
3008 	}
3009 
3010 	/*
3011 	 * If we are reacting to a mcg trap and leaving our sendonly or
3012 	 * non membership, the mcg is possibly already gone, so attempting
3013 	 * to leave might fail. On the other hand, we must try to leave
3014 	 * anyway, since this might be a trap from long ago, and we could
3015 	 * have potentially sendonly joined to a recent incarnation of
3016 	 * the mcg and are about to loose track of this information.
3017 	 */
3018 	if (do_detach) {
3019 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3020 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3021 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3022 	}
3023 
3024 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3025 	kmem_free(mce, sizeof (ibd_mce_t));
3026 }
3027 
3028 /*
3029  * Async code executed due to multicast and promiscuous disable requests
3030  * and mcg trap handling; also executed during driver detach. Mostly, a
3031  * leave and detach is done; except for the fullmember case when Tx
3032  * requests are pending, whence arrangements are made for subsequent
3033  * cleanup on Tx completion.
3034  */
3035 static void
3036 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3037 {
3038 	ipoib_mac_t mcmac;
3039 	boolean_t recycled;
3040 	ibd_mce_t *mce;
3041 
3042 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3043 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3044 
3045 	if (jstate == IB_MC_JSTATE_NON) {
3046 		recycled = B_TRUE;
3047 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3048 		/*
3049 		 * In case we are handling a mcg trap, we might not find
3050 		 * the mcg in the non list.
3051 		 */
3052 		if (mce == NULL)
3053 			return;
3054 	} else {
3055 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3056 
3057 		/*
3058 		 * In case we are handling a mcg trap, make sure the trap
3059 		 * is not arriving late; if we have an mce that indicates
3060 		 * that we are already a fullmember, that would be a clear
3061 		 * indication that the trap arrived late (ie, is for a
3062 		 * previous incarnation of the mcg).
3063 		 */
3064 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3065 			if ((mce == NULL) || (mce->mc_jstate ==
3066 			    IB_MC_JSTATE_FULL))
3067 				return;
3068 			ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3069 		} else {
3070 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3071 			ASSERT((mce != NULL) && (mce->mc_jstate ==
3072 			    IB_MC_JSTATE_FULL));
3073 			mce->mc_fullreap = B_TRUE;
3074 		}
3075 
3076 		/*
3077 		 * If no pending Tx's remain that reference the AH
3078 		 * for the mcg, recycle it from active to free list.
3079 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3080 		 * so the last completing Tx will cause an async reap
3081 		 * operation to be invoked, at which time we will drop our
3082 		 * membership to the mcg so that the pending Tx's complete
3083 		 * successfully. Refer to comments on "AH and MCE active
3084 		 * list manipulation" at top of this file. The lock protects
3085 		 * against Tx fast path and Tx cleanup code.
3086 		 */
3087 		mutex_enter(&state->id_ac_mutex);
3088 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3089 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3090 		    IB_MC_JSTATE_SEND_ONLY_NON));
3091 		mutex_exit(&state->id_ac_mutex);
3092 	}
3093 
3094 	if (recycled) {
3095 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3096 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3097 		ibd_async_reap_group(state, mce, mgid, jstate);
3098 	}
3099 }
3100 
3101 /*
3102  * Find the broadcast address as defined by IPoIB; implicitly
3103  * determines the IBA scope, mtu, tclass etc of the link the
3104  * interface is going to be a member of.
3105  */
3106 static ibt_status_t
3107 ibd_find_bgroup(ibd_state_t *state)
3108 {
3109 	ibt_mcg_attr_t mcg_attr;
3110 	uint_t numg;
3111 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3112 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3113 	    IB_MC_SCOPE_GLOBAL };
3114 	int i, mcgmtu;
3115 	boolean_t found = B_FALSE;
3116 
3117 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3118 	mcg_attr.mc_pkey = state->id_pkey;
3119 	state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK;
3120 
3121 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3122 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3123 
3124 		/*
3125 		 * Look for the IPoIB broadcast group.
3126 		 */
3127 		state->id_mgid.gid_prefix =
3128 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3129 		    ((uint64_t)state->id_scope << 48) |
3130 		    ((uint32_t)(state->id_pkey << 16)));
3131 		mcg_attr.mc_mgid = state->id_mgid;
3132 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3133 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3134 			found = B_TRUE;
3135 			break;
3136 		}
3137 
3138 	}
3139 
3140 	if (!found) {
3141 		ibd_print_warn(state, "IPoIB broadcast group absent");
3142 		return (IBT_FAILURE);
3143 	}
3144 
3145 	/*
3146 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3147 	 */
3148 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3149 	if (state->id_mtu < mcgmtu) {
3150 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3151 		    "greater than port's maximum MTU %d", mcgmtu,
3152 		    state->id_mtu);
3153 		return (IBT_FAILURE);
3154 	}
3155 	state->id_mtu = mcgmtu;
3156 
3157 	return (IBT_SUCCESS);
3158 }
3159 
3160 /*
3161  * Post ibt_attach() initialization.
3162  */
3163 static int
3164 ibd_drv_init(ibd_state_t *state)
3165 {
3166 	kthread_t *kht;
3167 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
3168 	ibt_ud_chan_query_attr_t ud_chan_attr;
3169 	ibt_hca_portinfo_t *port_infop;
3170 	ibt_hca_attr_t hca_attrs;
3171 	ibt_status_t ibt_status;
3172 	ibt_cq_attr_t cq_attr;
3173 	ib_guid_t hca_guid;
3174 	uint32_t real_size;
3175 	uint32_t *ptr;
3176 	char pathname[OBP_MAXPATHLEN];
3177 	uint_t psize, port_infosz;
3178 
3179 	/*
3180 	 * Initialize id_port before ibt_open_hca because of
3181 	 * ordering requirements in port up/down handling.
3182 	 */
3183 	if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS)
3184 		return (DDI_FAILURE);
3185 
3186 	if (ibt_open_hca(state->id_ibt_hdl, hca_guid,
3187 	    &state->id_hca_hdl) != IBT_SUCCESS) {
3188 		DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n");
3189 		return (DDI_FAILURE);
3190 	}
3191 
3192 	mutex_enter(&state->id_link_mutex);
3193 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
3194 	    state->id_port, &port_infop, &psize,
3195 	    &port_infosz);
3196 	if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
3197 		mutex_exit(&state->id_link_mutex);
3198 		DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n");
3199 		(void) ibt_close_hca(state->id_hca_hdl);
3200 		return (DDI_FAILURE);
3201 	}
3202 
3203 	/*
3204 	 * If the link already went down by the time we get here, give up;
3205 	 * we can not even get the gid since that is not valid. We would
3206 	 * fail in ibd_find_bgroup() anyway.
3207 	 */
3208 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
3209 		mutex_exit(&state->id_link_mutex);
3210 		ibt_free_portinfo(port_infop, port_infosz);
3211 		(void) ibt_close_hca(state->id_hca_hdl);
3212 		ibd_print_warn(state, "Port is not active");
3213 		return (DDI_FAILURE);
3214 	}
3215 
3216 	/*
3217 	 * This verifies the Pkey ibnexus handed us is still valid.
3218 	 * This is also the point from which the pkey table for the
3219 	 * port must hold the exact pkey value at the exact index
3220 	 * across port up/downs.
3221 	 */
3222 	if (ibt_pkey2index(state->id_hca_hdl, state->id_port,
3223 	    state->id_pkey, &state->id_pkix) != IBT_SUCCESS) {
3224 		mutex_exit(&state->id_link_mutex);
3225 		ibt_free_portinfo(port_infop, port_infosz);
3226 		DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n");
3227 		(void) ibt_close_hca(state->id_hca_hdl);
3228 		return (DDI_FAILURE);
3229 	}
3230 
3231 	state->id_mtu = (128 << port_infop->p_mtu);
3232 	state->id_sgid = *port_infop->p_sgid_tbl;
3233 	state->id_link_state = GLD_LINKSTATE_UP;
3234 	mutex_exit(&state->id_link_mutex);
3235 
3236 	ibt_free_portinfo(port_infop, port_infosz);
3237 
3238 	state->id_link_speed = ibd_get_portspeed(state);
3239 
3240 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
3241 	ASSERT(ibt_status == IBT_SUCCESS);
3242 
3243 	/*
3244 	 * We need to determine whether the HCA can support checksum
3245 	 * and indicate that to higher layers.
3246 	 */
3247 	if (ibd_csum_send > IBD_CSUM_NONE)
3248 		state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL;
3249 
3250 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
3251 		DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n");
3252 		goto drv_init_fail_find_bgroup;
3253 	}
3254 	state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE;
3255 
3256 	if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
3257 	    &state->id_pd_hdl) != IBT_SUCCESS) {
3258 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n");
3259 		goto drv_init_fail_alloc_pd;
3260 	}
3261 
3262 	/* Initialize the parallel ARP cache and AHs */
3263 	if (ibd_acache_init(state) != DDI_SUCCESS) {
3264 		DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n");
3265 		goto drv_init_fail_acache;
3266 	}
3267 
3268 	/*
3269 	 * Check various tunable limits.
3270 	 */
3271 	if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) {
3272 		ibd_print_warn(state, "Setting #sgl = %d instead of default %d",
3273 		    hca_attrs.hca_max_sgl, IBD_MAX_SQSEG);
3274 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
3275 	} else {
3276 		state->id_max_sqseg = IBD_MAX_SQSEG;
3277 	}
3278 
3279 	/*
3280 	 * First, check #r/s wqes against max channel size.
3281 	 */
3282 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE)
3283 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
3284 	else
3285 		state->id_num_rwqe = IBD_NUM_RWQE;
3286 
3287 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE)
3288 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
3289 	else
3290 		state->id_num_swqe = IBD_NUM_SWQE;
3291 
3292 	/*
3293 	 * Allocate Rx/combined CQ:
3294 	 * Theoretically, there is no point in having more than #rwqe
3295 	 * plus #swqe cqe's, except that the CQ will be signalled for
3296 	 * overflow when the last wqe completes, if none of the previous
3297 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
3298 	 * to make sure such overflow does not occur.
3299 	 */
3300 	cq_attr.cq_sched = NULL;
3301 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
3302 
3303 	if (ibd_separate_cqs == 1) {
3304 		/*
3305 		 * Allocate Receive CQ.
3306 		 */
3307 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
3308 			cq_attr.cq_size = state->id_num_rwqe + 1;
3309 		} else {
3310 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3311 			state->id_num_rwqe = cq_attr.cq_size - 1;
3312 		}
3313 
3314 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3315 			ibd_print_warn(state, "Computed #rwqe %d based on "
3316 			    "requested size and supportable CQ size is less "
3317 			    "than the required threshold %d",
3318 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3319 			goto drv_init_fail_min_rwqes;
3320 		}
3321 
3322 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3323 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3324 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3325 			goto drv_init_fail_alloc_rcq;
3326 		}
3327 
3328 		/*
3329 		 * Allocate Send CQ.
3330 		 */
3331 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
3332 			cq_attr.cq_size = state->id_num_swqe + 1;
3333 		} else {
3334 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3335 			state->id_num_swqe = cq_attr.cq_size - 1;
3336 		}
3337 
3338 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3339 		    &state->id_scq_hdl, &real_size) != IBT_SUCCESS) {
3340 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3341 			goto drv_init_fail_alloc_scq;
3342 		}
3343 	} else {
3344 		/*
3345 		 * Allocate combined Send/Receive CQ.
3346 		 */
3347 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
3348 		    state->id_num_swqe + 1)) {
3349 			cq_attr.cq_size = state->id_num_rwqe +
3350 			    state->id_num_swqe + 1;
3351 		} else {
3352 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3353 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
3354 			    state->id_num_rwqe) / (state->id_num_rwqe +
3355 			    state->id_num_swqe);
3356 			state->id_num_swqe = cq_attr.cq_size - 1 -
3357 			    state->id_num_rwqe;
3358 		}
3359 
3360 		if (state->id_num_rwqe < IBD_RX_THRESHOLD) {
3361 			ibd_print_warn(state, "Computed #rwqe %d based on "
3362 			    "requested size and supportable CQ size is less "
3363 			    "than the required threshold %d",
3364 			    state->id_num_rwqe, IBD_RX_THRESHOLD);
3365 			goto drv_init_fail_min_rwqes;
3366 		}
3367 
3368 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3369 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3370 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3371 			goto drv_init_fail_alloc_rcq;
3372 		}
3373 		state->id_scq_hdl = state->id_rcq_hdl;
3374 	}
3375 
3376 	/*
3377 	 * Print message in case we could not allocate as many wqe's
3378 	 * as was requested. Note that in the combined CQ case, we will
3379 	 * get the following message.
3380 	 */
3381 	if (state->id_num_rwqe != IBD_NUM_RWQE)
3382 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
3383 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
3384 	if (state->id_num_swqe != IBD_NUM_SWQE)
3385 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
3386 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
3387 
3388 	ud_alloc_attr.ud_flags	= IBT_WR_SIGNALED;
3389 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
3390 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
3391 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
3392 	ud_alloc_attr.ud_sizes.cs_sq	= state->id_num_swqe;
3393 	ud_alloc_attr.ud_sizes.cs_rq	= state->id_num_rwqe;
3394 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
3395 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
3396 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
3397 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
3398 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
3399 	ud_alloc_attr.ud_clone_chan	= NULL;
3400 	if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
3401 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) {
3402 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()"
3403 		    "\n");
3404 		goto drv_init_fail_alloc_chan;
3405 	}
3406 
3407 	if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) !=
3408 	    DDI_SUCCESS) {
3409 		DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()");
3410 		goto drv_init_fail_query_chan;
3411 	}
3412 	state->id_qpnum = ud_chan_attr.ud_qpn;
3413 
3414 	/* Initialize the Transmit buffer list */
3415 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
3416 		DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n");
3417 		goto drv_init_fail_txlist_init;
3418 	}
3419 
3420 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
3421 		/* Setup the handler we will use for regular DLPI stuff */
3422 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
3423 		if (ibt_enable_cq_notify(state->id_scq_hdl,
3424 		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
3425 			DPRINT(10, "ibd_drv_init : failed in"
3426 			    " ibt_enable_cq_notify()\n");
3427 			goto drv_init_fail_cq_notify;
3428 		}
3429 	}
3430 
3431 	/* Create the service fifos before we start receiving */
3432 	if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos,
3433 	    state)) == NULL) {
3434 		DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n");
3435 		goto drv_init_fail_srv_fifo;
3436 	}
3437 
3438 	/* Initialize the Receive buffer list */
3439 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
3440 		DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n");
3441 		goto drv_init_fail_rxlist_init;
3442 	}
3443 
3444 	/* Join to IPoIB broadcast group as required by IPoIB */
3445 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
3446 		DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n");
3447 		goto drv_init_fail_join_group;
3448 	}
3449 
3450 	/* Create the async thread */
3451 	if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
3452 	    TS_RUN, minclsyspri)) == NULL) {
3453 		/* Do we have to specially leave the group? */
3454 		DPRINT(10, "ibd_drv_init : failed in thread_create\n");
3455 		goto drv_init_fail_thread_create;
3456 	}
3457 	state->id_async_thrid = kht->t_did;
3458 
3459 	/*
3460 	 * The local mac address is now known. Create the IPoIB
3461 	 * address.
3462 	 */
3463 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
3464 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3465 	state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr;
3466 
3467 	/*
3468 	 * Similarly, program in the broadcast mac address.
3469 	 */
3470 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix,
3471 	    state->id_mgid.gid_guid);
3472 	state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr;
3473 
3474 	ptr = (uint32_t *)&state->id_macaddr;
3475 	DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n",
3476 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3477 	ptr = (uint32_t *)&state->id_bcaddr;
3478 	DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n",
3479 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3480 	DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n",
3481 	    state->id_pkey, state->id_mgid.gid_prefix,
3482 	    state->id_mgid.gid_guid);
3483 	DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n",
3484 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3485 	DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey);
3486 	DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu);
3487 	(void) ddi_pathname(state->id_dip, pathname);
3488 	DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname);
3489 
3490 	return (DDI_SUCCESS);
3491 
3492 drv_init_fail_thread_create:
3493 	ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL);
3494 
3495 drv_init_fail_join_group:
3496 	ibd_fini_rxlist(state);
3497 
3498 drv_init_fail_rxlist_init:
3499 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3500 
3501 drv_init_fail_srv_fifo:
3502 drv_init_fail_cq_notify:
3503 	ibd_fini_txlist(state);
3504 
3505 drv_init_fail_txlist_init:
3506 drv_init_fail_query_chan:
3507 	if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS)
3508 		DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()");
3509 
3510 drv_init_fail_alloc_chan:
3511 	if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) !=
3512 	    IBT_SUCCESS))
3513 		DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()");
3514 
3515 drv_init_fail_alloc_scq:
3516 	if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS)
3517 		DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()");
3518 
3519 drv_init_fail_min_rwqes:
3520 drv_init_fail_alloc_rcq:
3521 	ibd_acache_fini(state);
3522 drv_init_fail_acache:
3523 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3524 		DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()");
3525 
3526 drv_init_fail_alloc_pd:
3527 	ibt_free_mcg_info(state->id_mcinfo, 1);
3528 drv_init_fail_find_bgroup:
3529 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
3530 		DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()");
3531 
3532 	return (DDI_FAILURE);
3533 }
3534 
3535 /*
3536  * Allocate the statically allocated Tx buffer list.
3537  */
3538 static int
3539 ibd_init_txlist(ibd_state_t *state)
3540 {
3541 	ibd_swqe_t *swqe;
3542 	int i;
3543 
3544 	for (i = 0; i < state->id_num_swqe; i++) {
3545 		if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) {
3546 			DPRINT(10, "ibd_init_txlist : failed in "
3547 			    "ibd_alloc_swqe()\n");
3548 			ibd_fini_txlist(state);
3549 			return (DDI_FAILURE);
3550 		}
3551 
3552 		/* add to list */
3553 		state->id_tx_list.dl_cnt++;
3554 		if (state->id_tx_list.dl_head == NULL) {
3555 			swqe->swqe_prev = NULL;
3556 			swqe->swqe_next = NULL;
3557 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3558 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3559 		} else {
3560 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3561 			swqe->swqe_next = NULL;
3562 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3563 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3564 		}
3565 	}
3566 
3567 	return (DDI_SUCCESS);
3568 }
3569 
3570 /*
3571  * Free the statically allocated Tx buffer list.
3572  */
3573 static void
3574 ibd_fini_txlist(ibd_state_t *state)
3575 {
3576 	ibd_swqe_t *node;
3577 
3578 	mutex_enter(&state->id_tx_list.dl_mutex);
3579 	while (state->id_tx_list.dl_head != NULL) {
3580 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3581 		state->id_tx_list.dl_head = node->swqe_next;
3582 		state->id_tx_list.dl_cnt--;
3583 		ASSERT(state->id_tx_list.dl_cnt >= 0);
3584 		ibd_free_swqe(state, node);
3585 	}
3586 	mutex_exit(&state->id_tx_list.dl_mutex);
3587 }
3588 
3589 /*
3590  * Allocate a single send wqe and register it so it is almost
3591  * ready to be posted to the hardware.
3592  */
3593 static int
3594 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe)
3595 {
3596 	ibt_mr_attr_t mem_attr;
3597 	ibd_swqe_t *swqe;
3598 
3599 	swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP);
3600 	*wqe = swqe;
3601 	swqe->swqe_type = IBD_WQE_SEND;
3602 	swqe->swqe_next = NULL;
3603 	swqe->swqe_prev = NULL;
3604 	swqe->swqe_im_mblk = NULL;
3605 	swqe->w_mdtinfo = NULL;
3606 
3607 	/* alloc copy buffer, must be max size to handle multiple mblk case */
3608 	swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP);
3609 
3610 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3611 	mem_attr.mr_len = state->id_mtu;
3612 	mem_attr.mr_as = NULL;
3613 	mem_attr.mr_flags = IBT_MR_SLEEP;
3614 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3615 	    &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) !=
3616 	    IBT_SUCCESS) {
3617 		DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()");
3618 		kmem_free(swqe->swqe_copybuf.ic_bufaddr,
3619 		    state->id_mtu);
3620 		kmem_free(swqe, sizeof (ibd_swqe_t));
3621 		return (DDI_FAILURE);
3622 	}
3623 
3624 	swqe->swqe_copybuf.ic_sgl.ds_va =
3625 	    (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr;
3626 	swqe->swqe_copybuf.ic_sgl.ds_key =
3627 	    swqe->swqe_copybuf.ic_mr_desc.md_lkey;
3628 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3629 
3630 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3631 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3632 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3633 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3634 
3635 	/* These are set in send */
3636 	swqe->w_swr.wr_nds = 0;
3637 	swqe->w_swr.wr_sgl = NULL;
3638 
3639 	return (DDI_SUCCESS);
3640 }
3641 
3642 /*
3643  * Free an allocated send wqe.
3644  */
3645 static void
3646 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3647 {
3648 
3649 	if (ibt_deregister_mr(state->id_hca_hdl,
3650 	    swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3651 		DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()");
3652 		return;
3653 	}
3654 	kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu);
3655 	kmem_free(swqe, sizeof (ibd_swqe_t));
3656 }
3657 
3658 /*
3659  * Post a rwqe to the hardware and add it to the Rx list. The
3660  * "recycle" parameter indicates whether an old rwqe is being
3661  * recycled, or this is a new one.
3662  */
3663 static int
3664 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3665 {
3666 	if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) !=
3667 	    IBT_SUCCESS) {
3668 		DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()");
3669 		return (DDI_FAILURE);
3670 	}
3671 	atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3672 
3673 	/*
3674 	 * Buffers being recycled are already in the list.
3675 	 */
3676 	if (recycle)
3677 		return (DDI_SUCCESS);
3678 
3679 	mutex_enter(&state->id_rx_list.dl_mutex);
3680 	if (state->id_rx_list.dl_head == NULL) {
3681 		rwqe->rwqe_prev = NULL;
3682 		rwqe->rwqe_next = NULL;
3683 		state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3684 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3685 	} else {
3686 		rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3687 		rwqe->rwqe_next = NULL;
3688 		state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3689 		state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3690 	}
3691 	mutex_exit(&state->id_rx_list.dl_mutex);
3692 
3693 	return (DDI_SUCCESS);
3694 }
3695 
3696 /*
3697  * Allocate the statically allocated Rx buffer list.
3698  */
3699 static int
3700 ibd_init_rxlist(ibd_state_t *state)
3701 {
3702 	ibd_rwqe_t *rwqe;
3703 	int i;
3704 
3705 	for (i = 0; i < state->id_num_rwqe; i++) {
3706 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3707 			ibd_fini_rxlist(state);
3708 			return (DDI_FAILURE);
3709 		}
3710 
3711 		if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) {
3712 			ibd_free_rwqe(state, rwqe);
3713 			ibd_fini_rxlist(state);
3714 			return (DDI_FAILURE);
3715 		}
3716 	}
3717 
3718 	return (DDI_SUCCESS);
3719 }
3720 
3721 /*
3722  * Free the statically allocated Rx buffer list.
3723  *
3724  */
3725 static void
3726 ibd_fini_rxlist(ibd_state_t *state)
3727 {
3728 	ibd_rwqe_t *node;
3729 
3730 	mutex_enter(&state->id_rx_list.dl_mutex);
3731 	while (state->id_rx_list.dl_head != NULL) {
3732 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3733 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3734 		state->id_rx_list.dl_cnt--;
3735 		ASSERT(state->id_rx_list.dl_cnt >= 0);
3736 
3737 		ibd_free_rwqe(state, node);
3738 	}
3739 	mutex_exit(&state->id_rx_list.dl_mutex);
3740 }
3741 
3742 /*
3743  * Allocate a single recv wqe and register it so it is almost
3744  * ready to be posted to the hardware.
3745  */
3746 static int
3747 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3748 {
3749 	ibt_mr_attr_t mem_attr;
3750 	ibd_rwqe_t *rwqe;
3751 
3752 	if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3753 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3754 		return (DDI_FAILURE);
3755 	}
3756 	*wqe = rwqe;
3757 	rwqe->rwqe_type = IBD_WQE_RECV;
3758 	rwqe->w_state = state;
3759 	rwqe->rwqe_next = NULL;
3760 	rwqe->rwqe_prev = NULL;
3761 	rwqe->w_freeing_wqe = B_FALSE;
3762 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3763 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3764 
3765 	if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3766 	    IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) {
3767 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2");
3768 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3769 		return (DDI_FAILURE);
3770 	}
3771 
3772 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3773 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3774 	    NULL) {
3775 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3776 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3777 		    state->id_mtu + IPOIB_GRH_SIZE);
3778 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3779 		return (DDI_FAILURE);
3780 	}
3781 
3782 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3783 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3784 	mem_attr.mr_as = NULL;
3785 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3786 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3787 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3788 	    IBT_SUCCESS) {
3789 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
3790 		rwqe->w_freeing_wqe = B_TRUE;
3791 		freemsg(rwqe->rwqe_im_mblk);
3792 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3793 		    state->id_mtu + IPOIB_GRH_SIZE);
3794 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3795 		return (DDI_FAILURE);
3796 	}
3797 
3798 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3799 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3800 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3801 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3802 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3803 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3804 	rwqe->w_rwr.wr_nds = 1;
3805 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3806 
3807 	return (DDI_SUCCESS);
3808 }
3809 
3810 /*
3811  * Free an allocated recv wqe.
3812  */
3813 static void
3814 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3815 {
3816 
3817 	if (ibt_deregister_mr(state->id_hca_hdl,
3818 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3819 		DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()");
3820 		return;
3821 	}
3822 
3823 	/*
3824 	 * Indicate to the callback function that this rwqe/mblk
3825 	 * should not be recycled. The freemsg() will invoke
3826 	 * ibd_freemsg_cb().
3827 	 */
3828 	if (rwqe->rwqe_im_mblk != NULL) {
3829 		rwqe->w_freeing_wqe = B_TRUE;
3830 		freemsg(rwqe->rwqe_im_mblk);
3831 	}
3832 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3833 	    state->id_mtu + IPOIB_GRH_SIZE);
3834 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3835 }
3836 
3837 /*
3838  * Delete the rwqe being freed from the rx list.
3839  */
3840 static void
3841 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3842 {
3843 	mutex_enter(&state->id_rx_list.dl_mutex);
3844 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3845 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3846 	else
3847 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3848 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3849 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3850 	else
3851 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3852 	mutex_exit(&state->id_rx_list.dl_mutex);
3853 }
3854 
3855 /*
3856  * Pre ibt_detach() deconstruction.
3857  */
3858 static void
3859 ibd_drv_fini(ibd_state_t *state)
3860 {
3861 	ib_gid_t mgid;
3862 	ibd_mce_t *mce;
3863 	ibt_status_t status;
3864 	uint8_t jstate;
3865 
3866 	/*
3867 	 * Desubscribe from trap notices; we will be tearing down
3868 	 * the mcg lists soon. Make sure the trap handler does nothing
3869 	 * even if it is invoked (ie till we invoke ibt_detach()).
3870 	 */
3871 	ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
3872 	mutex_enter(&state->id_trap_lock);
3873 	state->id_trap_stop = B_TRUE;
3874 	while (state->id_trap_inprog > 0)
3875 		cv_wait(&state->id_trap_cv, &state->id_trap_lock);
3876 	mutex_exit(&state->id_trap_lock);
3877 
3878 	/*
3879 	 * Flushing the channel ensures that all pending WQE's
3880 	 * are marked with flush_error and handed to the CQ. It
3881 	 * does not guarantee the invocation of the CQ handler.
3882 	 * This call is guaranteed to return successfully for UD QPNs.
3883 	 */
3884 	status = ibt_flush_channel(state->id_chnl_hdl);
3885 	ASSERT(status == IBT_SUCCESS);
3886 
3887 	/*
3888 	 * We possibly need a loop here to wait for all the Tx
3889 	 * callbacks to happen. The Tx handlers will retrieve
3890 	 * held resources like AH ac_ref count, registered memory
3891 	 * and possibly ASYNC_REAP requests. Rx interrupts were already
3892 	 * turned off (in ibd_detach()); turn off Tx interrupts and
3893 	 * poll. By the time the polling returns an empty indicator,
3894 	 * we are sure we have seen all pending Tx callbacks. Note
3895 	 * that after the ibt_set_cq_handler() returns, the old handler
3896 	 * is guaranteed not to be invoked anymore.
3897 	 */
3898 	if (ibd_separate_cqs == 1)
3899 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
3900 	ibd_poll_compq(state, state->id_scq_hdl);
3901 
3902 	/*
3903 	 * No more async requests will be posted since the device has been
3904 	 * unregistered; completion handlers have been turned off, so Tx
3905 	 * handler will not cause any more ASYNC_REAP requests. Queue a
3906 	 * request for the async thread to exit, which will be serviced
3907 	 * after any pending ones. This can take a while, specially if the
3908 	 * SM is unreachable, since IBMF will slowly timeout each SM request
3909 	 * issued by the async thread. Reap the thread before continuing on,
3910 	 * we do not want it to be lingering in modunloaded code.
3911 	 */
3912 	ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT);
3913 	thread_join(state->id_async_thrid);
3914 
3915 	/*
3916 	 * We can not be in promiscuous mode anymore, upper layers
3917 	 * would have made a request to disable it (if ever set previously)
3918 	 * before the detach is allowed to progress to this point; and the
3919 	 * aysnc thread would have processed that request by now. Thus the
3920 	 * nonmember list is guaranteed empty at this point.
3921 	 */
3922 	ASSERT(state->id_prom_op != COMPLETED);
3923 
3924 	/*
3925 	 * Drop all residual full/non membership. This includes full
3926 	 * membership to the broadcast group, and any nonmembership
3927 	 * acquired during transmits. We do this after the Tx completion
3928 	 * handlers are done, since those might result in some late
3929 	 * leaves; this also eliminates a potential race with that
3930 	 * path wrt the mc full list insert/delete. Trap handling
3931 	 * has also been suppressed at this point. Thus, no locks
3932 	 * are required while traversing the mc full list.
3933 	 */
3934 	DPRINT(2, "ibd_drv_fini : clear full cache entries");
3935 	mce = list_head(&state->id_mc_full);
3936 	while (mce != NULL) {
3937 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
3938 		jstate = mce->mc_jstate;
3939 		mce = list_next(&state->id_mc_full, mce);
3940 		ibd_leave_group(state, mgid, jstate);
3941 	}
3942 
3943 	ibt_free_mcg_info(state->id_mcinfo, 1);
3944 
3945 	/*
3946 	 * Kill the channel now; guaranteed to return successfully
3947 	 * for UD QPNs.
3948 	 */
3949 	status = ibt_free_channel(state->id_chnl_hdl);
3950 	ASSERT(status == IBT_SUCCESS);
3951 
3952 	/*
3953 	 * Kill the CQ; all completion handlers are guaranteed to
3954 	 * have terminated by the time this returns. Since we killed
3955 	 * the QPN above, we can not receive the IBT_CQ_BUSY error.
3956 	 */
3957 	status = ibt_free_cq(state->id_rcq_hdl);
3958 	ASSERT(status == IBT_SUCCESS);
3959 
3960 	if (ibd_separate_cqs == 1) {
3961 		status = ibt_free_cq(state->id_scq_hdl);
3962 		ASSERT(status == IBT_SUCCESS);
3963 	}
3964 
3965 	/*
3966 	 * We killed the receive interrupts, thus, we will not be
3967 	 * required to handle received packets anymore. Thus, kill
3968 	 * service threads since they are not going to be used anymore.
3969 	 */
3970 	unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos);
3971 
3972 	/*
3973 	 * Since these following will act on the Rx/Tx list, which
3974 	 * is also looked at by the Rx/Tx handlers, keep them around
3975 	 * till all handlers are guaranteed to have completed.
3976 	 */
3977 	ibd_fini_rxlist(state);
3978 	ibd_fini_txlist(state);
3979 
3980 	/*
3981 	 * Clean up the active AH hash list.
3982 	 */
3983 	mod_hash_destroy_hash(state->id_ah_active_hash);
3984 
3985 	/*
3986 	 * Free parallel ARP cache and AHs; we are sure all of these
3987 	 * resources have been released by the Tx completion handler.
3988 	 */
3989 	ibd_acache_fini(state);
3990 
3991 	/*
3992 	 * We freed the QPN, all the MRs and AHs. This step should not
3993 	 * fail; print a warning message if it does fail, due to a bug
3994 	 * in the driver.
3995 	 */
3996 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3997 		ibd_print_warn(state, "failed to free protection domain");
3998 
3999 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
4000 		ibd_print_warn(state, "failed to close HCA device");
4001 }
4002 
4003 /*
4004  * IBA Rx/Tx completion queue handler. Guaranteed to be single
4005  * threaded and nonreentrant for this CQ. When using combined CQ,
4006  * this handles Tx and Rx completions. With separate CQs, this handles
4007  * only Rx completions.
4008  */
4009 /* ARGSUSED */
4010 static void
4011 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4012 {
4013 	ibd_state_t *state = (ibd_state_t *)arg;
4014 
4015 	atomic_add_64(&state->id_num_intrs, 1);
4016 	(void) gld_intr(state->id_macinfo);
4017 }
4018 
4019 /*
4020  * Separate CQ handler for Tx completions, when the Tx CQ is in
4021  * interrupt driven mode.
4022  */
4023 /* ARGSUSED */
4024 static void
4025 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4026 {
4027 	ibd_state_t *state = (ibd_state_t *)arg;
4028 
4029 	atomic_add_64(&state->id_num_intrs, 1);
4030 
4031 	/*
4032 	 * Poll for completed entries; the CQ will not interrupt any
4033 	 * more for completed packets.
4034 	 */
4035 	ibd_poll_compq(state, state->id_scq_hdl);
4036 
4037 	/*
4038 	 * Now enable CQ notifications; all completions originating now
4039 	 * will cause new interrupts.
4040 	 */
4041 	if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) !=
4042 	    IBT_SUCCESS) {
4043 		/*
4044 		 * We do not expect a failure here.
4045 		 */
4046 		DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
4047 	}
4048 
4049 	/*
4050 	 * Repoll to catch all packets that might have completed after
4051 	 * we finished the first poll loop and before interrupts got
4052 	 * armed.
4053 	 */
4054 	ibd_poll_compq(state, state->id_scq_hdl);
4055 }
4056 
4057 /*
4058  * Multicast group create/delete trap handler. These will be delivered
4059  * on a kernel thread (handling can thus block) and can be invoked
4060  * concurrently. The handler can be invoked anytime after it is
4061  * registered and before ibt_detach().
4062  */
4063 /* ARGSUSED */
4064 static void
4065 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4066     ibt_subnet_event_t *event)
4067 {
4068 	ibd_state_t *state = (ibd_state_t *)arg;
4069 	ibd_req_t *req;
4070 
4071 	/*
4072 	 * The trap handler will get invoked once for every event for
4073 	 * evert port. The input "gid" is the GID0 of the port the
4074 	 * trap came in on; we just need to act on traps that came
4075 	 * to our port, meaning the port on which the ipoib interface
4076 	 * resides. Since ipoib uses GID0 of the port, we just match
4077 	 * the gids to check whether we need to handle the trap.
4078 	 */
4079 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4080 		return;
4081 
4082 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4083 
4084 	switch (code) {
4085 		case IBT_SM_EVENT_UNAVAILABLE:
4086 			/*
4087 			 * If we are in promiscuous mode or have
4088 			 * sendnonmembers, we need to print a warning
4089 			 * message right now. Else, just store the
4090 			 * information, print when we enter promiscuous
4091 			 * mode or attempt nonmember send. We might
4092 			 * also want to stop caching sendnonmember.
4093 			 */
4094 			ibd_print_warn(state, "IBA multicast support "
4095 			    "degraded due to unavailability of multicast "
4096 			    "traps");
4097 			break;
4098 		case IBT_SM_EVENT_AVAILABLE:
4099 			/*
4100 			 * If we printed a warning message above or
4101 			 * while trying to nonmember send or get into
4102 			 * promiscuous mode, print an okay message.
4103 			 */
4104 			ibd_print_warn(state, "IBA multicast support "
4105 			    "restored due to availability of multicast "
4106 			    "traps");
4107 			break;
4108 		case IBT_SM_EVENT_MCG_CREATED:
4109 		case IBT_SM_EVENT_MCG_DELETED:
4110 			/*
4111 			 * Common processing of creation/deletion traps.
4112 			 * First check if the instance is being
4113 			 * [de]initialized; back off then, without doing
4114 			 * anything more, since we are not sure if the
4115 			 * async thread is around, or whether we might
4116 			 * be racing with the detach code in ibd_drv_fini()
4117 			 * that scans the mcg list.
4118 			 */
4119 			if (!ibd_async_safe(state))
4120 				return;
4121 
4122 			req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP);
4123 			req->rq_gid = event->sm_notice_gid;
4124 			req->rq_ptr = (void *)code;
4125 			ibd_queue_work_slot(state, req, ASYNC_TRAP);
4126 			break;
4127 	}
4128 }
4129 
4130 static void
4131 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4132 {
4133 	ib_gid_t mgid = req->rq_gid;
4134 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4135 
4136 	DPRINT(10, "ibd_async_trap : %d\n", code);
4137 
4138 	/*
4139 	 * Atomically search the nonmember and sendonlymember lists and
4140 	 * delete.
4141 	 */
4142 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4143 
4144 	if (state->id_prom_op == COMPLETED) {
4145 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4146 
4147 		/*
4148 		 * If in promiscuous mode, try to join/attach to the new
4149 		 * mcg. Given the unreliable out-of-order mode of trap
4150 		 * delivery, we can never be sure whether it is a problem
4151 		 * if the join fails. Thus, we warn the admin of a failure
4152 		 * if this was a creation trap. Note that the trap might
4153 		 * actually be reporting a long past event, and the mcg
4154 		 * might already have been deleted, thus we might be warning
4155 		 * in vain.
4156 		 */
4157 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4158 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4159 			ibd_print_warn(state, "IBA promiscuous mode missed "
4160 			    "new multicast gid %016llx:%016llx",
4161 			    (u_longlong_t)mgid.gid_prefix,
4162 			    (u_longlong_t)mgid.gid_guid);
4163 	}
4164 
4165 	/*
4166 	 * Free the request slot allocated by the subnet event thread.
4167 	 */
4168 	kmem_free(req, sizeof (ibd_req_t));
4169 
4170 	ibd_async_done(state);
4171 }
4172 
4173 /*
4174  * GLD entry point to reset hardware.
4175  */
4176 /* ARGSUSED */
4177 static int
4178 ibd_reset(gld_mac_info_t *macinfo)
4179 {
4180 	/*
4181 	 * This will be invoked from Style 1 open() and Style 2
4182 	 * attach() routines, ie just before the interface starts
4183 	 * getting used.
4184 	 */
4185 	return (GLD_SUCCESS);
4186 }
4187 
4188 /*
4189  * GLD entry point to start hardware.
4190  */
4191 /* ARGSUSED */
4192 static int
4193 ibd_start(gld_mac_info_t *macinfo)
4194 {
4195 	return (GLD_SUCCESS);
4196 }
4197 
4198 /*
4199  * GLD entry point to stop hardware from receiving packets.
4200  */
4201 /* ARGSUSED */
4202 static int
4203 ibd_stop(gld_mac_info_t *macinfo)
4204 {
4205 #ifdef RUN_PERFORMANCE
4206 	ibd_perf((ibd_state_t *)macinfo->gldm_private);
4207 #endif
4208 	return (GLD_SUCCESS);
4209 }
4210 
4211 /*
4212  * GLD entry point to modify device's mac address. We do not
4213  * allow address modifications.
4214  */
4215 static int
4216 ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr)
4217 {
4218 	ibd_state_t *state;
4219 
4220 	state = (ibd_state_t *)macinfo->gldm_private;
4221 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4222 		return (GLD_SUCCESS);
4223 	else
4224 		return (GLD_FAILURE);
4225 }
4226 
4227 /*
4228  * The blocking part of the IBA join/leave operations are done out
4229  * of here on the async thread.
4230  */
4231 static void
4232 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4233 {
4234 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4235 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4236 
4237 	if (op == ASYNC_JOIN) {
4238 		int ret = ERRORED;
4239 
4240 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL)
4241 			ret = COMPLETED;
4242 
4243 		state->id_multi_op = ret;
4244 	} else {
4245 		/*
4246 		 * Here, we must search for the proper mcg_info and
4247 		 * use that to leave the group.
4248 		 */
4249 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4250 	}
4251 }
4252 
4253 /*
4254  * GLD entry point for multicast enable/disable requests.
4255  * Invoked by GLD only on the first multicast enable for a specific
4256  * address (GLD is free to retry ocassionally if we return RETRY),
4257  * and on last disable of the same address. Just queue the operation
4258  * to the async thread.
4259  */
4260 static int
4261 ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op)
4262 {
4263 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4264 	ipoib_mac_t *mcast;
4265 	ib_gid_t mgid;
4266 	ib_qpn_t mcqpn;
4267 	int ret;
4268 
4269 	/*
4270 	 * The incoming multicast address might not be aligned properly
4271 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4272 	 * it to look like one though, to get the offsets of the mc gid,
4273 	 * since we know we are not going to dereference any values with
4274 	 * the ipoib_mac_t pointer.
4275 	 */
4276 	mcast = (ipoib_mac_t *)mcmac;
4277 
4278 	/*
4279 	 * Check validity of MCG address. We could additionally check
4280 	 * that a enable/disable is not being issued on the "broadcast"
4281 	 * mcg, but since this operation is only invokable by priviledged
4282 	 * programs anyway, we allow the flexibility to those dlpi apps.
4283 	 * Note that we do not validate the "scope" of the IBA mcg.
4284 	 */
4285 	bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t));
4286 	if (mcqpn != htonl(IB_MC_QPN))
4287 		return (GLD_FAILURE);
4288 
4289 	/*
4290 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4291 	 * nothing (ie we stay JOINed to the broadcast group done in
4292 	 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
4293 	 * requires to be joined to broadcast groups at all times.
4294 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4295 	 * depends on this.
4296 	 */
4297 	if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr,
4298 	    IPOIB_ADDRL) == 0)
4299 		return (GLD_SUCCESS);
4300 
4301 	ibd_n2h_gid(mcast, &mgid);
4302 
4303 	if (op == GLD_MULTI_ENABLE) {
4304 		DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n",
4305 		    mgid.gid_prefix, mgid.gid_guid);
4306 		ret = GLD_RETRY;
4307 		mutex_enter(&state->id_mc_mutex);
4308 		if (state->id_multi_op == NOTSTARTED) {
4309 			state->id_multi_req.rq_gid = mgid;
4310 			ibd_queue_work_slot(state, &state->id_multi_req,
4311 			    ASYNC_JOIN);
4312 			state->id_multi_op = ONGOING;
4313 			bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL);
4314 		} else if (bcmp(&state->id_multi_addr, mcast,
4315 		    IPOIB_ADDRL) == 0) {
4316 			if (state->id_multi_op != ONGOING) {
4317 				if (state->id_multi_op == COMPLETED)
4318 					ret = GLD_SUCCESS;
4319 				else if (state->id_multi_op == ERRORED)
4320 					ret = GLD_FAILURE;
4321 				if (state->id_multi_queued) {
4322 					state->id_multi_queued = B_FALSE;
4323 					ibd_queue_work_slot(state,
4324 					    &state->id_multi_req, ASYNC_POKE);
4325 				} else {
4326 					state->id_multi_op = NOTSTARTED;
4327 				}
4328 			}
4329 		} else {
4330 			/*
4331 			 * Hmmm, a set was tried on another mcg. We
4332 			 * need to make sure to gld_sched for this
4333 			 * stream to retry once the ongoing one terminates.
4334 			 * The gld_sched out of the async thread on completion
4335 			 * of the mcg join is not enough; because the queued
4336 			 * stream might come in and get a RETRY again because
4337 			 * the mcg join result has still not been reaped by
4338 			 * the originator. If gld_sched ensured that streams
4339 			 * get tried in the order they received RETRYs, things
4340 			 * would be simpler.
4341 			 */
4342 			state->id_multi_queued = B_TRUE;
4343 		}
4344 		mutex_exit(&state->id_mc_mutex);
4345 	} else {
4346 		ibd_mce_t *mce;
4347 		DPRINT(1, "ibd_set_multicast : unset_multicast : "
4348 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4349 		ret = GLD_SUCCESS;
4350 		mutex_enter(&state->id_mc_mutex);
4351 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
4352 		mutex_exit(&state->id_mc_mutex);
4353 		/*
4354 		 * GLD should not have invoked us unless the mcg was
4355 		 * added in the past.
4356 		 */
4357 		ASSERT(mce != NULL);
4358 		ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0);
4359 		ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE);
4360 	}
4361 	return (ret);
4362 }
4363 
4364 /*
4365  * The blocking part of the IBA promiscuous operations are done
4366  * out of here on the async thread. The dlpireq parameter indicates
4367  * whether this invocation is due to a dlpi request or due to
4368  * a port up/down event.
4369  */
4370 static void
4371 ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq)
4372 {
4373 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4374 	ib_gid_t mgid;
4375 
4376 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4377 
4378 	/*
4379 	 * Mark the request slot as empty and reusable for the
4380 	 * next promiscuous set request.
4381 	 */
4382 	if (dlpireq)
4383 		state->id_prom_op = NOTSTARTED;
4384 
4385 	while (mce != NULL) {
4386 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4387 		mce = list_next(&state->id_mc_non, mce);
4388 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4389 	}
4390 }
4391 
4392 /*
4393  * The blocking part of the IBA promiscuous operations are done
4394  * out of here on the async thread. The dlpireq parameter indicates
4395  * whether this invocation is due to a dlpi request or due to
4396  * a port up/down event.
4397  */
4398 static void
4399 ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq)
4400 {
4401 	ibt_mcg_attr_t mcg_attr;
4402 	ibt_mcg_info_t *mcg_info;
4403 	ib_gid_t mgid;
4404 	uint_t numg;
4405 	int i;
4406 
4407 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4408 
4409 	/*
4410 	 * Obtain all active MC groups on the IB fabric with
4411 	 * specified criteria (scope + Pkey + Qkey + mtu).
4412 	 */
4413 	bzero(&mcg_attr, sizeof (mcg_attr));
4414 	mcg_attr.mc_pkey = state->id_pkey;
4415 	mcg_attr.mc_scope = state->id_scope;
4416 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4417 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4418 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4419 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4420 	    IBT_SUCCESS) {
4421 		ibd_print_warn(state, "Could not get list of IBA multicast "
4422 		    "groups");
4423 		if (dlpireq)
4424 			state->id_prom_op = ERRORED;
4425 		return;
4426 	}
4427 
4428 	/*
4429 	 * Iterate over the returned mcg's and join as NonMember
4430 	 * to the IP mcg's.
4431 	 */
4432 	for (i = 0; i < numg; i++) {
4433 		/*
4434 		 * Do a NonMember JOIN on the MC group.
4435 		 */
4436 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4437 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4438 			ibd_print_warn(state, "IBA promiscuous mode missed "
4439 			    "multicast gid %016llx:%016llx",
4440 			    (u_longlong_t)mgid.gid_prefix,
4441 			    (u_longlong_t)mgid.gid_guid);
4442 	}
4443 
4444 	ibt_free_mcg_info(mcg_info, numg);
4445 	if (dlpireq)
4446 		state->id_prom_op = COMPLETED;
4447 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4448 }
4449 
4450 /*
4451  * GLD entry point for multicast promiscuous enable/disable requests.
4452  * GLD assumes phys state receives more packets than multi state,
4453  * which is not true for IPoIB. Thus, treat the multi and phys
4454  * promiscuous states the same way to work with GLD's assumption.
4455  */
4456 static int
4457 ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode)
4458 {
4459 	ibd_state_t *state;
4460 	int ret;
4461 
4462 	state = (ibd_state_t *)macinfo->gldm_private;
4463 	switch (mode) {
4464 		case GLD_MAC_PROMISC_PHYS:
4465 		case GLD_MAC_PROMISC_MULTI:
4466 			DPRINT(1, "ibd_set_promiscuous : set_promisc : %d",
4467 			    mode);
4468 			/*
4469 			 * Look at gld: this might be getting
4470 			 * called because someone is turning off
4471 			 * prom_phys. Nothing needs to be done in
4472 			 * that case.
4473 			 */
4474 			ret = GLD_RETRY;
4475 			mutex_enter(&state->id_mc_mutex);
4476 			switch (state->id_prom_op) {
4477 				case NOTSTARTED:
4478 					ibd_queue_work_slot(state,
4479 					    &state->id_prom_req, ASYNC_PROMON);
4480 					state->id_prom_op = ONGOING;
4481 					break;
4482 				case COMPLETED:
4483 					ret = GLD_SUCCESS;
4484 					break;
4485 				case ERRORED:
4486 					state->id_prom_op = NOTSTARTED;
4487 					ret = GLD_FAILURE;
4488 			}
4489 			/*
4490 			 * Else in the ONGOING case, nothing special
4491 			 * needs to be done; the async thread will poke
4492 			 * all streams. A prior set, or the last unset
4493 			 * request is still in the async queue.
4494 			 */
4495 			mutex_exit(&state->id_mc_mutex);
4496 			return (ret);
4497 		case GLD_MAC_PROMISC_NONE:
4498 			DPRINT(1, "ibd_set_promiscuous : unset_promisc");
4499 			/*
4500 			 * Look at gld: this might be getting
4501 			 * called because someone is turning off
4502 			 * prom_phys or prom_multi. Mark operation
4503 			 * as ongoing, to prevent a subsequent set
4504 			 * operation from using the request slot
4505 			 * unless the async thread is ready to give
4506 			 * it up. The async thread will mark the
4507 			 * request slot as usable as soon as it
4508 			 * starts doing the unset operation.
4509 			 */
4510 			ASSERT(state->id_prom_op == COMPLETED);
4511 			state->id_prom_op = ONGOING;
4512 			ibd_queue_work_slot(state, &state->id_prom_req,
4513 			    ASYNC_PROMOFF);
4514 			return (GLD_SUCCESS);
4515 		default:
4516 			return (GLD_NOTSUPPORTED);
4517 	}
4518 }
4519 
4520 /*
4521  * GLD entry point for gathering statistics.
4522  */
4523 static int
4524 ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp)
4525 {
4526 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4527 
4528 	sp->glds_errrcv = 0;
4529 	sp->glds_underflow = 0;
4530 	sp->glds_missed = 0;
4531 
4532 	sp->glds_overflow = state->id_tx_short;	/* Tx overflow */
4533 	sp->glds_speed = state->id_link_speed;
4534 	sp->glds_media = GLDM_IB;
4535 	sp->glds_errxmt = state->id_ah_error;	/* failed AH translation */
4536 	sp->glds_norcvbuf = state->id_rx_short;	/* # times below water mark */
4537 	sp->glds_intr = state->id_num_intrs;	/* number of intrs */
4538 
4539 	return (GLD_SUCCESS);
4540 }
4541 
4542 /*
4543  * Arrange for a Tx request that is failing, or has already failed due to
4544  * Tx descriptor shortage to be retried soon. Used mostly with poll based
4545  * Tx completion, since gld_sched() can not be invoked in ibd_send() context
4546  * due to potential single processor deadlock (when the ibd_send() is
4547  * caused by gld_recv()).
4548  */
4549 static void
4550 ibd_tx_sched(ibd_state_t *state)
4551 {
4552 	mutex_enter(&state->id_sched_lock);
4553 	/*
4554 	 * If a sched request is already enqueued, do not try to do
4555 	 * that again, since the async work request list would get
4556 	 * corrupted.
4557 	 */
4558 	if (!state->id_sched_queued) {
4559 		state->id_sched_queued = B_TRUE;
4560 		ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED);
4561 	}
4562 	mutex_exit(&state->id_sched_lock);
4563 }
4564 
4565 /*
4566  * The gld_sched() in ibd_async_work() does the work for us.
4567  */
4568 static void
4569 ibd_async_txsched(ibd_state_t *state)
4570 {
4571 	mutex_enter(&state->id_sched_lock);
4572 	state->id_sched_queued = B_FALSE;
4573 	mutex_exit(&state->id_sched_lock);
4574 }
4575 
4576 /*
4577  * Release one or more chained send wqes back into free list.
4578  */
4579 static void
4580 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe,
4581     boolean_t send_context)
4582 {
4583 	boolean_t call_gld_sched = B_FALSE;
4584 
4585 	/*
4586 	 * Add back on Tx list for reuse.
4587 	 */
4588 	lswqe->swqe_next = NULL;
4589 	mutex_enter(&state->id_tx_list.dl_mutex);
4590 	if (state->id_tx_list.dl_pending_sends) {
4591 		state->id_tx_list.dl_pending_sends = B_FALSE;
4592 		call_gld_sched = B_TRUE;
4593 	}
4594 	if (state->id_tx_list.dl_head == NULL) {
4595 		state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe);
4596 	} else {
4597 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe);
4598 	}
4599 	state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe);
4600 	mutex_exit(&state->id_tx_list.dl_mutex);
4601 
4602 	/*
4603 	 * See comments in ibd_tx_sched(); make sure not to call
4604 	 * gld_sched() if we are in ibd_send() context.
4605 	 */
4606 	if (call_gld_sched)
4607 		if ((ibd_txcomp_poll == 0) && (!send_context))
4608 			gld_sched(state->id_macinfo);
4609 		else
4610 			ibd_tx_sched(state);
4611 }
4612 
4613 /*
4614  * Acquire a number of chained send wqe's from the free list. Returns the
4615  * number of wqe's actually allocated, and pointers to the first and last
4616  * in the chain.
4617  */
4618 static int
4619 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe,
4620     int number)
4621 {
4622 	int numwqe = number;
4623 	ibd_swqe_t *node, *wqes;
4624 
4625 	/*
4626 	 * Check and reclaim some of the completed Tx requests.
4627 	 * If someone else is already in this code and pulling Tx
4628 	 * completions, no need to poll, since the current lock holder
4629 	 * will do the work anyway. Normally, we poll for completions
4630 	 * every few Tx attempts, but if we are short on Tx descriptors,
4631 	 * we always try to poll.
4632 	 */
4633 	if ((ibd_txcomp_poll == 1) &&
4634 	    (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) ==
4635 	    0) || state->id_tx_list.dl_pending_sends) &&
4636 	    (mutex_tryenter(&state->id_txcomp_lock) != 0)) {
4637 		DPRINT(10, "ibd_send : polling");
4638 		ibd_poll_compq(state, state->id_scq_hdl);
4639 		mutex_exit(&state->id_txcomp_lock);
4640 	}
4641 
4642 	/*
4643 	 * Grab required transmit wqes.
4644 	 */
4645 	mutex_enter(&state->id_tx_list.dl_mutex);
4646 	node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head);
4647 	while ((node != NULL) && (numwqe-- > 1))
4648 		node = WQE_TO_SWQE(node->swqe_next);
4649 
4650 	/*
4651 	 * If we did not find the number we were looking for, flag no resource.
4652 	 * Adjust list appropriately in either case.
4653 	 */
4654 	if (numwqe != 0) {
4655 		state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL;
4656 		state->id_tx_list.dl_pending_sends = B_TRUE;
4657 		mutex_exit(&state->id_tx_list.dl_mutex);
4658 		DPRINT(5, "ibd_acquire_swqes: out of Tx wqe");
4659 		atomic_add_64(&state->id_tx_short, 1);
4660 		if (ibd_txcomp_poll == 1) {
4661 			/*
4662 			 * Arrange for a future gld_sched(). Note that when
4663 			 * the Tx is retried after a little bit, it will
4664 			 * surely poll the completion queue above.
4665 			 */
4666 			ibd_tx_sched(state);
4667 		}
4668 	} else {
4669 		state->id_tx_list.dl_head = node->swqe_next;
4670 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node))
4671 			state->id_tx_list.dl_tail = NULL;
4672 		mutex_exit(&state->id_tx_list.dl_mutex);
4673 	}
4674 
4675 	/*
4676 	 * Set return parameters.
4677 	 */
4678 	*fswqe = wqes;
4679 	*lswqe = node;
4680 	return (number - numwqe);
4681 }
4682 
4683 typedef struct ibd_mpack_s {
4684 	ibd_swqe_t	*ip_swqe;
4685 	uint32_t	ip_start, ip_stuff, ip_flags;
4686 	ibd_ace_t	*ip_ace;
4687 	boolean_t	ip_copy;
4688 	boolean_t	ip_noresources;
4689 	int		ip_segs;
4690 	ibt_mr_hdl_t	ip_mhdl[IBD_MDTMAX_SEGS + 1];
4691 	ibt_mr_desc_t	ip_mdsc[IBD_MDTMAX_SEGS + 1];
4692 } ibd_mpack_t;
4693 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s))
4694 
4695 static void
4696 ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info)
4697 {
4698 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4699 	ibd_mpack_t *ptx = (ibd_mpack_t *)cookie;
4700 	ibd_ace_t *ace = ptx->ip_ace;
4701 	ibd_swqe_t *wqes, *node = ptx->ip_swqe;
4702 	boolean_t docopy = ptx->ip_copy;
4703 	uchar_t *pptr;
4704 	int i, pktsize, seglen, seg = 0;
4705 
4706 	/*
4707 	 * Snag the next wqe before we post this one, since it could complete
4708 	 * very fast and the wqe could get put at the end of the list,
4709 	 * corrupting our chain. Set up for the next packet.
4710 	 */
4711 	wqes = WQE_TO_SWQE(node->swqe_next);
4712 	ptx->ip_swqe = wqes;
4713 
4714 	IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff,
4715 	    ptx->ip_flags);
4716 	node->w_ahandle = ace;
4717 	node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
4718 
4719 	if (docopy) {
4720 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
4721 		pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
4722 		pktsize = seglen = PDESC_HDRL(dl_pkt_info);
4723 		if (seglen > 0) {
4724 			bcopy(dl_pkt_info->hdr_rptr, pptr, seglen);
4725 			pptr += seglen;
4726 		}
4727 		for (; seg < dl_pkt_info->pld_cnt; seg++)
4728 			if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) {
4729 				bcopy(dl_pkt_info->pld_ary[seg].pld_rptr,
4730 				    pptr, seglen);
4731 				pptr += seglen;
4732 				pktsize += seglen;
4733 			}
4734 		node->w_swr.wr_nds = 1;
4735 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
4736 	} else {
4737 		seglen = PDESC_HDRL(dl_pkt_info);
4738 		if (seglen > 0) {
4739 			node->w_smblk_sgl[seg].ds_va =
4740 			    (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr;
4741 			node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey;
4742 			node->w_smblk_sgl[seg].ds_len = seglen;
4743 			seg++;
4744 		}
4745 		for (i = 0; i < dl_pkt_info->pld_cnt; i++) {
4746 			if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) {
4747 				node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t)
4748 				    (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr;
4749 				node->w_smblk_sgl[seg].ds_key =
4750 				    ptx->ip_mdsc[dl_pkt_info->
4751 					pld_ary[i].pld_pbuf_idx + 1].md_lkey;
4752 				node->w_smblk_sgl[seg].ds_len = seglen;
4753 				seg++;
4754 			}
4755 		}
4756 		node->w_swr.wr_sgl = node->w_smblk_sgl;
4757 		node->w_swr.wr_nds = seg;
4758 	}
4759 
4760 	if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) !=
4761 	    IBT_SUCCESS) {
4762 		/*
4763 		 * We never expect a failure here. But handle it, just in case.
4764 		 * If this is not the last packet, there are no problems; if
4765 		 * it is the last packet and the previous ones have not been
4766 		 * transmitted yet by the hardware, in the registration case,
4767 		 * the hardware might transmit garbage since we will be
4768 		 * freemsg'ing. The AH is still safe.
4769 		 */
4770 		DPRINT(5, "ibd_mdt_txone: posting failed");
4771 		ibd_tx_cleanup(state, node, B_TRUE);
4772 	}
4773 }
4774 
4775 static int
4776 ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie)
4777 {
4778 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4779 	multidata_t *dlmdp = mmd_getmultidata(mp);
4780 	ibd_mpack_t *mdinfo;
4781 	mbufinfo_t bufinfo, *binfo = &bufinfo;
4782 	pattrinfo_t attr_info;
4783 	uchar_t *dlap;
4784 	ibt_mr_attr_t mem_attr;
4785 	ibd_swqe_t *wqes, *node;
4786 	ipoib_mac_t *dest;
4787 	size_t hsize, psize = 0;
4788 	int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL);
4789 	int i, ret;
4790 	uint32_t end, value;
4791 	boolean_t noresources = B_FALSE;
4792 
4793 	ASSERT(DB_TYPE(mp) == M_MULTIDATA);
4794 	ASSERT(mp->b_cont == NULL);
4795 
4796 	if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0)
4797 		return (0);
4798 	else if (numwqes != numpackets)
4799 		noresources = B_TRUE;
4800 
4801 	DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node);
4802 
4803 	/*
4804 	 * Allocate the cookie that will be passed to subsequent packet
4805 	 * transmit and post_mdt calls by GLD. We can not sleep, so if
4806 	 * there is no memory, just tell GLD to drop the entire MDT message.
4807 	 */
4808 	if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) {
4809 		ibd_release_swqes(state, wqes, node, B_TRUE);
4810 		return (-1);
4811 	}
4812 	*cookie = (void *)mdinfo;
4813 	mdinfo->ip_noresources = noresources;
4814 
4815 	/*
4816 	 * Walk Global Attributes. If TCP failed to provide destination
4817 	 * information, or some interposing module removed the information,
4818 	 * fail the entire message.
4819 	 */
4820 	attr_info.type = PATTR_DSTADDRSAP;
4821 	if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) {
4822 		ibd_release_swqes(state, wqes, node, B_TRUE);
4823 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4824 		return (-1);
4825 	}
4826 	dlap = ((pattr_addr_t *)attr_info.buf)->addr;
4827 	dest = (ipoib_mac_t *)dlap;
4828 
4829 	/*
4830 	 * Get the AH for this destination, incrementing the posted
4831 	 * reference count properly.
4832 	 */
4833 	if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret,
4834 	    numwqes)) == NULL) {
4835 		ibd_release_swqes(state, wqes, node, B_TRUE);
4836 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4837 		return ((ret == GLD_FAILURE) ? -1 : 0);
4838 	}
4839 
4840 	/*
4841 	 * Depending on how costly it is to copy vs register, we try to
4842 	 * register, falling back on copying if we fail.
4843 	 */
4844 	mmd_getregions(dlmdp, &bufinfo);
4845 	hsize = binfo->hbuf_wptr - binfo->hbuf_rptr;
4846 	for (i = 0; i < binfo->pbuf_cnt; i++)
4847 		psize += (binfo->pbuf_ary[i].pbuf_wptr -
4848 		    binfo->pbuf_ary[i].pbuf_rptr);
4849 	if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) {
4850 		mdinfo->ip_segs = i + 1;
4851 		if (hsize != 0) {
4852 			mem_attr.mr_as = NULL;
4853 			mem_attr.mr_flags = IBT_MR_NOSLEEP;
4854 			mem_attr.mr_vaddr =
4855 			    (uint64_t)(uintptr_t)binfo->hbuf_rptr;
4856 			mem_attr.mr_len = hsize;
4857 			if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
4858 			    &mem_attr, &mdinfo->ip_mhdl[0],
4859 			    &mdinfo->ip_mdsc[0]) != IBT_SUCCESS)
4860 				goto ibd_mdt_copy;
4861 			DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize);
4862 		}
4863 		for (i = 0; i < binfo->pbuf_cnt; i++) {
4864 			if ((psize = (binfo->pbuf_ary[i].pbuf_wptr -
4865 			    binfo->pbuf_ary[i].pbuf_rptr)) != 0) {
4866 				mem_attr.mr_as = NULL;
4867 				mem_attr.mr_flags = IBT_MR_NOSLEEP;
4868 				mem_attr.mr_vaddr = (uint64_t)(uintptr_t)
4869 				    binfo->pbuf_ary[i].pbuf_rptr;
4870 				mem_attr.mr_len = psize;
4871 				if (ibt_register_mr(state->id_hca_hdl,
4872 				    state->id_pd_hdl, &mem_attr,
4873 				    &mdinfo->ip_mhdl[i + 1],
4874 				    &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) {
4875 					for (; i >= 0; i--) {
4876 						(void) ibt_deregister_mr(
4877 						    state->id_hca_hdl,
4878 						    mdinfo->ip_mhdl[i]);
4879 					}
4880 					goto ibd_mdt_copy;
4881 				}
4882 				DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize);
4883 			}
4884 		}
4885 
4886 		mdinfo->ip_copy = B_FALSE;
4887 
4888 		/*
4889 		 * All the deregistration must happen once the last swqe
4890 		 * completes.
4891 		 */
4892 		node->swqe_im_mblk = mp;
4893 		node->w_mdtinfo = mdinfo;
4894 		DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node);
4895 	} else {
4896 ibd_mdt_copy:
4897 		mdinfo->ip_copy = B_TRUE;
4898 	}
4899 
4900 	/*
4901 	 * Do checksum related work.
4902 	 */
4903 	IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff,
4904 	    &end, &value, &mdinfo->ip_flags);
4905 
4906 	mdinfo->ip_swqe = wqes;
4907 	return (numwqes);
4908 }
4909 
4910 /* ARGSUSED */
4911 static void
4912 ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie)
4913 {
4914 	ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie;
4915 
4916 	if (mdinfo->ip_copy) {
4917 		if (!mdinfo->ip_noresources)
4918 			freemsg(mp);
4919 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
4920 	}
4921 }
4922 
4923 /*
4924  * GLD entry point for transmitting a datagram.
4925  * The passed in packet has this format:
4926  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
4927  */
4928 static int
4929 ibd_send(gld_mac_info_t *macinfo, mblk_t *mp)
4930 {
4931 	ibt_status_t ibt_status;
4932 	ibt_mr_attr_t mem_attr;
4933 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
4934 	ibd_ace_t *ace;
4935 	ibd_swqe_t *node;
4936 	ipoib_mac_t *dest;
4937 	ipoib_ptxhdr_t *ipibp;
4938 	ip6_t *ip6h;
4939 	mblk_t *nmp = mp;
4940 	uint_t pktsize;
4941 	size_t	blksize;
4942 	uchar_t *bufp;
4943 	int i, ret, len, nmblks = 1;
4944 	boolean_t dofree = B_TRUE;
4945 
4946 	if (ibd_acquire_swqes(state, &node, &node, 1) == 0)
4947 		return (GLD_NORESOURCES);
4948 
4949 	/*
4950 	 * Obtain an address handle for the destination.
4951 	 */
4952 	dest = (ipoib_mac_t *)mp->b_rptr;
4953 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
4954 		node->w_ahandle = ace;
4955 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
4956 	} else {
4957 		DPRINT(5,
4958 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
4959 		    ((ret == GLD_FAILURE) ? "failed" : "queued"),
4960 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
4961 		    htonl(dest->ipoib_gidpref[1]),
4962 		    htonl(dest->ipoib_gidsuff[0]),
4963 		    htonl(dest->ipoib_gidsuff[1]));
4964 		node->w_ahandle = NULL;
4965 		goto ibd_send_fail;
4966 	}
4967 
4968 	/*
4969 	 * For ND6 packets, padding is at the front of the source lladdr.
4970 	 * Insert the padding at front.
4971 	 */
4972 	ipibp = (ipoib_ptxhdr_t *)mp->b_rptr;
4973 	if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) {
4974 		if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) {
4975 			if (!pullupmsg(mp, IPV6_HDR_LEN +
4976 			    sizeof (ipoib_ptxhdr_t))) {
4977 				DPRINT(10, "ibd_send: pullupmsg failure ");
4978 				ret = GLD_FAILURE;
4979 				goto ibd_send_fail;
4980 			}
4981 		}
4982 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t));
4983 		len = ntohs(ip6h->ip6_plen);
4984 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
4985 			if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) +
4986 			    IPV6_HDR_LEN + len) {
4987 				if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) +
4988 				    IPV6_HDR_LEN + len)) {
4989 					DPRINT(10, "ibd_send: pullupmsg "
4990 					    "failure ");
4991 					ret = GLD_FAILURE;
4992 					goto ibd_send_fail;
4993 				}
4994 			}
4995 			/* LINTED: E_CONSTANT_CONDITION */
4996 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
4997 		}
4998 	}
4999 
5000 	mp->b_rptr += IPOIB_ADDRL;
5001 	while (((nmp = nmp->b_cont) != NULL) &&
5002 	    (++nmblks < (state->id_max_sqseg + 1)));
5003 	pktsize = msgsize(mp);
5004 	if (pktsize > state->id_mtu) {
5005 		ret = GLD_BADARG;
5006 		goto ibd_send_fail;
5007 	}
5008 
5009 	/*
5010 	 * Do checksum related work.
5011 	 */
5012 	IBD_CKSUM_SEND(mp);
5013 
5014 	/*
5015 	 * Copy the data to preregistered buffers, or register the buffer.
5016 	 */
5017 	if ((nmblks <= state->id_max_sqseg) &&
5018 	    (pktsize > IBD_TX_COPY_THRESHOLD)) {
5019 		for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) {
5020 			mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr;
5021 			mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr;
5022 			mem_attr.mr_as = NULL;
5023 			mem_attr.mr_flags = IBT_MR_NOSLEEP;
5024 			ibt_status = ibt_register_mr(state->id_hca_hdl,
5025 			    state->id_pd_hdl, &mem_attr,
5026 			    &node->w_smblkbuf[i].im_mr_hdl,
5027 			    &node->w_smblkbuf[i].im_mr_desc);
5028 			if (ibt_status != IBT_SUCCESS) {
5029 				/*
5030 				 * We do not expect any error other than
5031 				 * IBT_INSUFF_RESOURCE.
5032 				 */
5033 				if (ibt_status != IBT_INSUFF_RESOURCE)
5034 				    DPRINT(10, "ibd_send:%d\n",
5035 				    "failed in ibt_register_mem()",
5036 				    ibt_status);
5037 				DPRINT(5, "ibd_send: registration failed");
5038 				node->w_swr.wr_nds = i;
5039 				/*
5040 				 * Deregister already registered memory;
5041 				 * fallback to copying the mblk.
5042 				 */
5043 				ibd_deregister_mr(state, node);
5044 				goto ibd_copy_path;
5045 			}
5046 			node->w_smblk_sgl[i].ds_va =
5047 			    (ib_vaddr_t)(uintptr_t)nmp->b_rptr;
5048 			node->w_smblk_sgl[i].ds_key =
5049 			    node->w_smblkbuf[i].im_mr_desc.md_lkey;
5050 			node->w_smblk_sgl[i].ds_len =
5051 			    nmp->b_wptr - nmp->b_rptr;
5052 		}
5053 		node->swqe_im_mblk = mp;
5054 		node->w_swr.wr_sgl = node->w_smblk_sgl;
5055 		node->w_swr.wr_nds = nmblks;
5056 		dofree = B_FALSE;
5057 	} else {
5058 ibd_copy_path:
5059 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5060 		node->w_swr.wr_nds = 1;
5061 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5062 
5063 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5064 		for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
5065 			blksize = MBLKL(nmp);
5066 			bcopy(nmp->b_rptr, bufp, blksize);
5067 			bufp += blksize;
5068 		}
5069 	}
5070 
5071 	/*
5072 	 * Queue the wqe to hardware.
5073 	 */
5074 	ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL);
5075 	if (ibt_status != IBT_SUCCESS) {
5076 		/*
5077 		 * We should not fail here; but just in case we do, we
5078 		 * tell GLD about this error.
5079 		 */
5080 		ret = GLD_FAILURE;
5081 		DPRINT(5, "ibd_send: posting failed");
5082 		goto ibd_send_fail;
5083 	}
5084 
5085 	DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X",
5086 	    INCTXPACK, htonl(ace->ac_mac.ipoib_qpn),
5087 	    htonl(ace->ac_mac.ipoib_gidpref[0]),
5088 	    htonl(ace->ac_mac.ipoib_gidpref[1]),
5089 	    htonl(ace->ac_mac.ipoib_gidsuff[0]),
5090 	    htonl(ace->ac_mac.ipoib_gidsuff[1]));
5091 
5092 	if (dofree)
5093 		freemsg(mp);
5094 
5095 	return (GLD_SUCCESS);
5096 
5097 ibd_send_fail:
5098 	ibd_tx_cleanup(state, node, B_TRUE);
5099 	return (ret);
5100 }
5101 
5102 /*
5103  * GLD entry point for handling interrupts. When using combined CQ,
5104  * this handles Tx and Rx completions. With separate CQs, this handles
5105  * only Rx completions.
5106  */
5107 static uint_t
5108 ibd_intr(gld_mac_info_t *macinfo)
5109 {
5110 	ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private;
5111 
5112 	/*
5113 	 * Poll for completed entries; the CQ will not interrupt any
5114 	 * more for incoming (or transmitted) packets.
5115 	 */
5116 	ibd_poll_compq(state, state->id_rcq_hdl);
5117 
5118 	/*
5119 	 * Now enable CQ notifications; all packets that arrive now
5120 	 * (or complete transmission) will cause new interrupts.
5121 	 */
5122 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
5123 	    IBT_SUCCESS) {
5124 		/*
5125 		 * We do not expect a failure here.
5126 		 */
5127 		DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5128 	}
5129 
5130 	/*
5131 	 * Repoll to catch all packets that might have arrived after
5132 	 * we finished the first poll loop and before interrupts got
5133 	 * armed.
5134 	 */
5135 	ibd_poll_compq(state, state->id_rcq_hdl);
5136 
5137 	return (DDI_INTR_CLAIMED);
5138 }
5139 
5140 /*
5141  * Common code for interrupt handling as well as for polling
5142  * for all completed wqe's while detaching.
5143  */
5144 static void
5145 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5146 {
5147 	ibd_wqe_t *wqe;
5148 	ibt_wc_t *wc, *wcs;
5149 	uint_t numwcs;
5150 	int i;
5151 
5152 	/*
5153 	 * In some cases (eg detaching), this code can be invoked on
5154 	 * any cpu after disabling cq notification (thus no concurrency
5155 	 * exists). Apart from that, the following applies normally:
5156 	 * The receive completion handling is always on the Rx interrupt
5157 	 * cpu. Transmit completion handling could be from any cpu if
5158 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5159 	 * is interrupt driven. Combined completion handling is always
5160 	 * on the interrupt cpu. Thus, lock accordingly and use the
5161 	 * proper completion array.
5162 	 */
5163 	if (cq_hdl == state->id_rcq_hdl)
5164 		wcs = state->id_wcs;
5165 	else
5166 		wcs = state->id_txwcs;
5167 
5168 	while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) {
5169 
5170 		for (i = 0, wc = wcs; i < numwcs; i++, wc++) {
5171 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5172 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5173 			    (wqe->w_type == IBD_WQE_RECV));
5174 			if (wc->wc_status != IBT_WC_SUCCESS) {
5175 				/*
5176 				 * Channel being torn down.
5177 				 */
5178 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5179 					DPRINT(5, "ibd_intr: flush error");
5180 					/*
5181 					 * Only invoke the Tx handler to
5182 					 * release possibly held resources
5183 					 * like AH refcount etc. Can not
5184 					 * invoke Rx handler because it might
5185 					 * try adding buffers to the Rx pool
5186 					 * when we are trying to deinitialize.
5187 					 */
5188 					if (wqe->w_type == IBD_WQE_RECV)
5189 						continue;
5190 				} else {
5191 					DPRINT(10, "%s %d",
5192 					    "ibd_intr: Bad CQ status",
5193 					    wc->wc_status);
5194 				}
5195 			}
5196 			if (wqe->w_type == IBD_WQE_SEND)
5197 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe),
5198 				    B_FALSE);
5199 			else
5200 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5201 		}
5202 	}
5203 }
5204 
5205 /*
5206  * Deregister the mr associated with a given mblk.
5207  */
5208 static void
5209 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe)
5210 {
5211 	int i;
5212 
5213 	DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe,
5214 	    swqe->w_swr.wr_nds);
5215 	/*
5216 	 * If this is an MDT case, process accordingly.
5217 	 */
5218 	if (swqe->w_mdtinfo != NULL) {
5219 		ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo;
5220 
5221 		for (i = 0; i < mdinfo->ip_segs; i++)
5222 			if ((mdinfo->ip_mhdl[i] != 0) &&
5223 			    (ibt_deregister_mr(state->id_hca_hdl,
5224 			    mdinfo->ip_mhdl[i]) != IBT_SUCCESS))
5225 				DPRINT(10, "MDT deregistration failed\n");
5226 		ASSERT(!mdinfo->ip_copy);
5227 		kmem_free(mdinfo, sizeof (ibd_mpack_t));
5228 		swqe->w_mdtinfo = NULL;
5229 		return;
5230 	}
5231 
5232 	for (i = 0; i < swqe->w_swr.wr_nds; i++) {
5233 		if (ibt_deregister_mr(state->id_hca_hdl,
5234 		    swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) {
5235 			/*
5236 			 * We do not expect any errors here.
5237 			 */
5238 			DPRINT(10, "failed in ibt_deregister_mem()\n");
5239 		}
5240 	}
5241 }
5242 
5243 /*
5244  * Common code that deals with clean ups after a successful or
5245  * erroneous transmission attempt.
5246  */
5247 static void
5248 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context)
5249 {
5250 	ibd_ace_t *ace = swqe->w_ahandle;
5251 
5252 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5253 
5254 	/*
5255 	 * If this was a dynamic registration in ibd_send() or in MDT,
5256 	 * deregister now.
5257 	 */
5258 	if (swqe->swqe_im_mblk != NULL) {
5259 		ibd_deregister_mr(state, swqe);
5260 		freemsg(swqe->swqe_im_mblk);
5261 		swqe->swqe_im_mblk = NULL;
5262 	}
5263 
5264 	/*
5265 	 * Drop the reference count on the AH; it can be reused
5266 	 * now for a different destination if there are no more
5267 	 * posted sends that will use it. This can be eliminated
5268 	 * if we can always associate each Tx buffer with an AH.
5269 	 * The ace can be null if we are cleaning up from the
5270 	 * ibd_send() error path.
5271 	 */
5272 	if (ace != NULL) {
5273 		/*
5274 		 * The recycling logic can be eliminated from here
5275 		 * and put into the async thread if we create another
5276 		 * list to hold ACE's for unjoined mcg's.
5277 		 */
5278 		if (DEC_REF_DO_CYCLE(ace)) {
5279 			ibd_mce_t *mce;
5280 
5281 			/*
5282 			 * Check with the lock taken: we decremented
5283 			 * reference count without the lock, and some
5284 			 * transmitter might alreay have bumped the
5285 			 * reference count (possible in case of multicast
5286 			 * disable when we leave the AH on the active
5287 			 * list). If not still 0, get out, leaving the
5288 			 * recycle bit intact.
5289 			 *
5290 			 * Atomically transition the AH from active
5291 			 * to free list, and queue a work request to
5292 			 * leave the group and destroy the mce. No
5293 			 * transmitter can be looking at the AH or
5294 			 * the MCE in between, since we have the
5295 			 * ac_mutex lock. In the SendOnly reap case,
5296 			 * it is not neccesary to hold the ac_mutex
5297 			 * and recheck the ref count (since the AH was
5298 			 * taken off the active list), we just do it
5299 			 * to have uniform processing with the Full
5300 			 * reap case.
5301 			 */
5302 			mutex_enter(&state->id_ac_mutex);
5303 			mce = ace->ac_mce;
5304 			if (GET_REF_CYCLE(ace) == 0) {
5305 				CLEAR_REFCYCLE(ace);
5306 				/*
5307 				 * Identify the case of fullmember reap as
5308 				 * opposed to mcg trap reap. Also, port up
5309 				 * might set ac_mce to NULL to indicate Tx
5310 				 * cleanup should do no more than put the
5311 				 * AH in the free list (see ibd_async_link).
5312 				 */
5313 				if (mce != NULL) {
5314 					ace->ac_mce = NULL;
5315 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
5316 					/*
5317 					 * mc_req was initialized at mce
5318 					 * creation time.
5319 					 */
5320 					ibd_queue_work_slot(state,
5321 					    &mce->mc_req, ASYNC_REAP);
5322 				}
5323 				IBD_ACACHE_INSERT_FREE(state, ace);
5324 			}
5325 			mutex_exit(&state->id_ac_mutex);
5326 		}
5327 	}
5328 
5329 	/*
5330 	 * Release the send wqe for reuse.
5331 	 */
5332 	ibd_release_swqes(state, swqe, swqe, send_context);
5333 }
5334 
5335 /*
5336  * Processing to be done after receipt of a packet; hand off to GLD
5337  * in the format expected by GLD.
5338  * The recvd packet has this format: 2b sap :: 00 :: data.
5339  */
5340 static void
5341 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5342 {
5343 	ipoib_pgrh_t *pgrh;
5344 	mblk_t *mp;
5345 	ipoib_hdr_t *ipibp;
5346 	ip6_t *ip6h;
5347 	int rxcnt, len;
5348 
5349 	/*
5350 	 * Track number handed to upper layer, and number still
5351 	 * available to receive packets.
5352 	 */
5353 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5354 	ASSERT(rxcnt >= 0);
5355 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5356 
5357 	/*
5358 	 * Adjust write pointer depending on how much data came in.
5359 	 */
5360 	mp = rwqe->rwqe_im_mblk;
5361 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5362 
5363 	/*
5364 	 * If the GRH is not valid, indicate to GLD by setting
5365 	 * the VerTcFlow field to 0. Else, update the pseudoGRH
5366 	 * so that GLD can determine the source mac of the packet.
5367 	 */
5368 	pgrh = (ipoib_pgrh_t *)mp->b_rptr;
5369 	if (wc->wc_flags & IBT_WC_GRH_PRESENT)
5370 		pgrh->ipoib_sqpn = htonl(wc->wc_qpn);
5371 	else
5372 		pgrh->ipoib_vertcflow = 0;
5373 
5374 	DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK);
5375 
5376 	/*
5377 	 * For ND6 packets, padding is at the front of the source/target
5378 	 * lladdr. However the inet6 layer is not aware of it, hence remove
5379 	 * the padding from such packets.
5380 	 */
5381 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
5382 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
5383 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
5384 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5385 			    sizeof (ipoib_hdr_t))) {
5386 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
5387 				freemsg(mp);
5388 				return;
5389 			}
5390 		}
5391 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5392 		len = ntohs(ip6h->ip6_plen);
5393 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5394 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
5395 			    IPV6_HDR_LEN + len) {
5396 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
5397 				    IPV6_HDR_LEN + len)) {
5398 					DPRINT(10, "ibd_process_rx: pullupmsg"
5399 					    " failed");
5400 					freemsg(mp);
5401 					return;
5402 				}
5403 			}
5404 			/* LINTED: E_CONSTANT_CONDITION */
5405 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
5406 		}
5407 	}
5408 
5409 	/*
5410 	 * Hand off to service thread/GLD. When we have hardware that
5411 	 * does hardware checksum, we will pull the checksum from the
5412 	 * work completion structure here.
5413 	 * on interrupt cpu.
5414 	 */
5415 	ibd_send_up(state, mp);
5416 
5417 	/*
5418 	 * Possibly replenish the Rx pool if needed.
5419 	 */
5420 	if (rxcnt < IBD_RX_THRESHOLD) {
5421 		state->id_rx_short++;
5422 		if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) {
5423 			if (ibd_post_rwqe(state, rwqe, B_FALSE) ==
5424 			    DDI_FAILURE) {
5425 				ibd_free_rwqe(state, rwqe);
5426 				return;
5427 			}
5428 		}
5429 	}
5430 }
5431 
5432 /*
5433  * Callback code invoked from STREAMs when the recv data buffer is free
5434  * for recycling.
5435  */
5436 static void
5437 ibd_freemsg_cb(char *arg)
5438 {
5439 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
5440 	ibd_state_t *state = rwqe->w_state;
5441 
5442 	/*
5443 	 * If the wqe is being destructed, do not attempt recycling.
5444 	 */
5445 	if (rwqe->w_freeing_wqe == B_TRUE) {
5446 		DPRINT(6, "ibd_freemsg_cb: wqe being freed");
5447 		return;
5448 	}
5449 
5450 	/*
5451 	 * Upper layer has released held mblk.
5452 	 */
5453 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
5454 
5455 	if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) {
5456 		/*
5457 		 * There are already enough buffers on the Rx ring.
5458 		 * Free this one up.
5459 		 */
5460 		rwqe->rwqe_im_mblk = NULL;
5461 		ibd_delete_rwqe(state, rwqe);
5462 		ibd_free_rwqe(state, rwqe);
5463 		DPRINT(6, "ibd_freemsg_cb: free up wqe");
5464 	} else {
5465 		rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
5466 		    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
5467 		if (rwqe->rwqe_im_mblk == NULL) {
5468 			ibd_delete_rwqe(state, rwqe);
5469 			ibd_free_rwqe(state, rwqe);
5470 			DPRINT(6, "ibd_freemsg_cb: desballoc failed");
5471 			return;
5472 		}
5473 
5474 		/*
5475 		 * Post back to h/w. We could actually have more than
5476 		 * id_num_rwqe WQEs on the list if there were multiple
5477 		 * ibd_freemsg_cb() calls outstanding (since the lock is
5478 		 * not held the entire time). This will start getting
5479 		 * corrected over subsequent ibd_freemsg_cb() calls.
5480 		 */
5481 		if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
5482 			ibd_delete_rwqe(state, rwqe);
5483 			ibd_free_rwqe(state, rwqe);
5484 			return;
5485 		}
5486 	}
5487 }
5488 
5489 #ifdef RUN_PERFORMANCE
5490 
5491 /*
5492  * To run the performance test, first do the "ifconfig ibdN plumb" on
5493  * the Rx and Tx side. Then use mdb -kw to tweak the following variables:
5494  * ibd_performance=1.
5495  * ibd_receiver=1 on Rx side.
5496  * ibd_sender=1 on Tx side.
5497  * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update
5498  * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will
5499  * make it drop into a 1 minute loop waiting for packets. An
5500  * ifconfig/unplumb on the Tx will cause it to send packets to Rx.
5501  */
5502 
5503 #define	IBD_NUM_UNSIGNAL	ibd_num_unsignal
5504 #define	IBD_TX_PKTSIZE		ibd_tx_pktsize
5505 #define	IBD_TX_DATASIZE		ibd_tx_datasize
5506 
5507 static ibd_swqe_t **swqes;
5508 static ibt_wc_t *wcs;
5509 
5510 /*
5511  * Set these on Rx and Tx side to do performance run.
5512  */
5513 static int ibd_performance = 0;
5514 static int ibd_receiver = 0;
5515 static int ibd_sender = 0;
5516 static ipoib_mac_t ibd_dest;
5517 
5518 /*
5519  * Interrupt coalescing is achieved by asking for a completion intr
5520  * only every ibd_num_unsignal'th packet.
5521  */
5522 static int ibd_num_unsignal = 8;
5523 
5524 /*
5525  * How big is each packet?
5526  */
5527 static int ibd_tx_pktsize = 2048;
5528 
5529 /*
5530  * Total data size to be transmitted.
5531  */
5532 static int ibd_tx_datasize = 512*1024*1024;
5533 
5534 static volatile boolean_t cq_handler_ran = B_FALSE;
5535 static volatile int num_completions;
5536 
5537 /* ARGSUSED */
5538 static void
5539 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg)
5540 {
5541 	ibd_state_t *state = (ibd_state_t *)arg;
5542 	ibt_cq_hdl_t cqhdl;
5543 	ibd_wqe_t *wqe;
5544 	uint_t polled, i;
5545 	boolean_t cq_enabled = B_FALSE;
5546 
5547 	if (ibd_receiver == 1)
5548 		cqhdl = state->id_rcq_hdl;
5549 	else
5550 		cqhdl = state->id_scq_hdl;
5551 
5552 	/*
5553 	 * Mark the handler as having run and possibly freed up some
5554 	 * slots. Blocked sends can be retried.
5555 	 */
5556 	cq_handler_ran = B_TRUE;
5557 
5558 repoll:
5559 	while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) ==
5560 	    IBT_SUCCESS) {
5561 		num_completions += polled;
5562 		if (ibd_receiver == 1) {
5563 			/*
5564 			 * We can immediately recycle the buffer. No
5565 			 * need to pass up to any IP layer ...
5566 			 */
5567 			for (i = 0; i < polled; i++) {
5568 				wqe = (ibd_wqe_t *)wcs[i].wc_id;
5569 				(void) ibt_post_recv(state->id_chnl_hdl,
5570 				    &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL);
5571 			}
5572 		}
5573 	}
5574 
5575 	/*
5576 	 * If we just repolled, we are done; exit.
5577 	 */
5578 	if (cq_enabled)
5579 		return;
5580 
5581 	/*
5582 	 * Enable CQ.
5583 	 */
5584 	if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
5585 		/*
5586 		 * We do not expect a failure here.
5587 		 */
5588 		cmn_err(CE_CONT, "ibd_perf_handler: notify failed");
5589 	}
5590 	cq_enabled = B_TRUE;
5591 
5592 	/*
5593 	 * Repoll for packets that came in after we finished previous
5594 	 * poll loop but before we turned on notifications.
5595 	 */
5596 	goto repoll;
5597 }
5598 
5599 static void
5600 ibd_perf_tx(ibd_state_t *state)
5601 {
5602 	ibt_mr_hdl_t mrhdl;
5603 	ibt_mr_desc_t mrdesc;
5604 	ibt_mr_attr_t mem_attr;
5605 	ibt_status_t stat;
5606 	ibd_ace_t *ace = NULL;
5607 	ibd_swqe_t *node;
5608 	uchar_t *sendbuf;
5609 	longlong_t stime, etime;
5610 	longlong_t sspin, espin, tspin = 0;
5611 	int i, reps, packets;
5612 
5613 	cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X",
5614 	    htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]),
5615 	    htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]),
5616 	    htonl(ibd_dest.ipoib_gidsuff[1]));
5617 	if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) ||
5618 	    (ibd_dest.ipoib_gidpref[1] == 0)) {
5619 		cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address");
5620 		return;
5621 	}
5622 
5623 	packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE);
5624 	reps = (packets / IBD_NUM_SWQE);
5625 
5626 	cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE);
5627 	cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE);
5628 	cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets);
5629 	cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE);
5630 	cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL);
5631 	if ((packets % IBD_NUM_UNSIGNAL) != 0) {
5632 		/*
5633 		 * This is required to ensure the last packet will trigger
5634 		 * a CQ handler callback, thus we can spin waiting fot all
5635 		 * packets to be received.
5636 		 */
5637 		cmn_err(CE_CONT,
5638 		    "ibd_perf_tx: #Packets not multiple of Signal Grp size");
5639 		return;
5640 	}
5641 	num_completions = 0;
5642 
5643 	swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE,
5644 	    KM_NOSLEEP);
5645 	if (swqes == NULL) {
5646 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5647 		return;
5648 	}
5649 
5650 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5651 	if (wcs == NULL) {
5652 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5653 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5654 		return;
5655 	}
5656 
5657 	/*
5658 	 * Get the ud_dest for the destination.
5659 	 */
5660 	ibd_async_acache(state, &ibd_dest);
5661 	mutex_enter(&state->id_ac_mutex);
5662 	ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0);
5663 	mutex_exit(&state->id_ac_mutex);
5664 	if (ace == NULL) {
5665 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5666 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5667 		cmn_err(CE_CONT, "ibd_perf_tx: no AH");
5668 		return;
5669 	}
5670 
5671 	/*
5672 	 * Set up the send buffer.
5673 	 */
5674 	sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP);
5675 	if (sendbuf == NULL) {
5676 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5677 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5678 		cmn_err(CE_CONT, "ibd_perf_tx: no send buffer");
5679 		return;
5680 	}
5681 
5682 	/*
5683 	 * This buffer can be used in the case when we want to
5684 	 * send data from the same memory area over and over;
5685 	 * it might help in reducing memory traffic.
5686 	 */
5687 	mem_attr.mr_vaddr = (uint64_t)sendbuf;
5688 	mem_attr.mr_len = IBD_TX_PKTSIZE;
5689 	mem_attr.mr_as = NULL;
5690 	mem_attr.mr_flags = IBT_MR_NOSLEEP;
5691 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
5692 	    &mrhdl, &mrdesc) != IBT_SUCCESS) {
5693 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5694 		kmem_free(sendbuf, IBD_TX_PKTSIZE);
5695 		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5696 		cmn_err(CE_CONT, "ibd_perf_tx: registration failed");
5697 		return;
5698 	}
5699 
5700 	/*
5701 	 * Allocate private send wqe's.
5702 	 */
5703 	for (i = 0; i < IBD_NUM_SWQE; i++) {
5704 		if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) {
5705 			kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5706 			kmem_free(sendbuf, IBD_TX_PKTSIZE);
5707 			kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5708 			cmn_err(CE_CONT, "ibd_alloc_swqe failure");
5709 			return;
5710 		}
5711 		node->w_ahandle = ace;
5712 #if 0
5713 		node->w_smblkbuf[0].im_mr_hdl = mrhdl;
5714 		node->w_smblkbuf[0].im_mr_desc = mrdesc;
5715 		node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf;
5716 		node->w_smblk_sgl[0].ds_key =
5717 		    node->w_smblkbuf[0].im_mr_desc.md_lkey;
5718 		node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE;
5719 		node->w_swr.wr_sgl = node->w_smblk_sgl;
5720 #else
5721 		node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE;
5722 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5723 #endif
5724 
5725 		/*
5726 		 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
5727 		 * is marked to invoke the CQ handler. That is the only
5728 		 * way we come to know when the send queue can accept more
5729 		 * WRs.
5730 		 */
5731 		if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
5732 			node->w_swr.wr_flags = IBT_WR_NO_FLAGS;
5733 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5734 		node->w_swr.wr_nds = 1;
5735 
5736 		swqes[i] = node;
5737 	}
5738 
5739 	ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state);
5740 
5741 	/*
5742 	 * Post all the requests. We expect this stream of post's will
5743 	 * not overwhelm the hardware due to periodic completions and
5744 	 * pollings that happen out of ibd_perf_handler.
5745 	 * Post a set of requests, till the channel can accept; after
5746 	 * that, wait for the CQ handler to notify us that there is more
5747 	 * space.
5748 	 */
5749 	stime = gethrtime();
5750 	for (; reps > 0; reps--)
5751 		for (i = 0; i < IBD_NUM_SWQE; i++) {
5752 			node = swqes[i];
5753 retry:
5754 			if ((stat = ibt_post_send(state->id_chnl_hdl,
5755 			    &node->w_swr, 1, NULL)) != IBT_SUCCESS) {
5756 				if (stat == IBT_CHAN_FULL) {
5757 					/*
5758 					 * Spin till the CQ handler runs
5759 					 * and then try again.
5760 					 */
5761 					sspin = gethrtime();
5762 					while (!cq_handler_ran);
5763 					espin = gethrtime();
5764 					tspin += (espin - sspin);
5765 					cq_handler_ran = B_FALSE;
5766 					goto retry;
5767 				}
5768 				cmn_err(CE_CONT, "post failure %d/%d", stat, i);
5769 				goto done;
5770 			}
5771 		}
5772 
5773 done:
5774 	/*
5775 	 * We should really be snapshotting when we get the last
5776 	 * completion.
5777 	 */
5778 	while (num_completions != (packets / IBD_NUM_UNSIGNAL));
5779 	etime = gethrtime();
5780 
5781 	cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d",
5782 	    num_completions);
5783 	cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime));
5784 	cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin);
5785 
5786 	/*
5787 	 * Wait a sec for everything to get over.
5788 	 */
5789 	delay(drv_usectohz(2000000));
5790 
5791 	/*
5792 	 * Reset CQ handler to real one; free resources.
5793 	 */
5794 	if (ibd_separate_cqs == 0) {
5795 		ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state);
5796 	} else {
5797 		if (ibd_txcomp_poll == 0)
5798 			ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler,
5799 			    state);
5800 		else
5801 			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5802 	}
5803 
5804 	for (i = 0; i < IBD_NUM_SWQE; i++)
5805 		ibd_free_swqe(state, swqes[i]);
5806 	(void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
5807 	kmem_free(sendbuf, IBD_TX_PKTSIZE);
5808 	kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5809 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5810 }
5811 
5812 static void
5813 ibd_perf_rx(ibd_state_t *state)
5814 {
5815 	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
5816 	if (wcs == NULL) {
5817 		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
5818 		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
5819 		return;
5820 	}
5821 
5822 	/*
5823 	 * We do not need to allocate private recv wqe's. We will
5824 	 * just use the regular ones.
5825 	 */
5826 
5827 	num_completions = 0;
5828 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state);
5829 
5830 	/*
5831 	 * Delay for a minute for all the packets to come in from
5832 	 * transmitter.
5833 	 */
5834 	cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE);
5835 	delay(drv_usectohz(60000000));
5836 	cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions);
5837 
5838 	/*
5839 	 * Reset CQ handler to real one; free resources.
5840 	 */
5841 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5842 	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
5843 }
5844 
5845 static void
5846 ibd_perf(ibd_state_t *state)
5847 {
5848 	if (ibd_performance == 0)
5849 		return;
5850 
5851 	if (ibd_receiver == 1) {
5852 		ibd_perf_rx(state);
5853 		return;
5854 	}
5855 
5856 	if (ibd_sender == 1) {
5857 		ibd_perf_tx(state);
5858 		return;
5859 	}
5860 }
5861 
5862 #endif /* RUN_PERFORMANCE */
5863