xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision e0e638160d72f8685f1481f6308bc368cd233c3f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <inet/tcp.h>		/* for tcph_t */
56 #include <netinet/icmp6.h>	/* for icmp6_t */
57 #include <sys/callb.h>
58 #include <sys/modhash.h>
59 
60 #include <sys/ib/clients/ibd/ibd.h>
61 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
62 #include <sys/note.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
67 /*
68  * Per-interface tunables
69  *
70  * ibd_tx_copy_thresh
71  *     This sets the threshold at which ibd will attempt to do a bcopy of the
72  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
 *     is restricted by various parameters, so this value must be changed
 *     only after careful consideration.  For instance, IB HCAs currently
75  *     impose a relatively small limit (when compared to ethernet NICs) on the
76  *     length of the SGL for transmit. On the other hand, the ip stack could
77  *     send down mp chains that are quite long when LSO is enabled.
78  *
79  * ibd_num_swqe
80  *     Number of "send WQE" elements that will be allocated and used by ibd.
81  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
82  *     buffer in each of these send wqes must be taken into account. This
83  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
 *     currently set to the same value as ibd_tx_copy_thresh, but may be
85  *     changed independently if needed).
86  *
87  * ibd_num_rwqe
88  *     Number of "receive WQE" elements that will be allocated and used by
89  *     ibd. This parameter is limited by the maximum channel size of the HCA.
90  *     Each buffer in the receive wqe will be of MTU size.
91  *
92  * ibd_num_lso_bufs
93  *     Number of "larger-than-MTU" copy buffers to use for cases when the
94  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
95  *     and too large to be used with regular MTU-sized copy buffers. It is
96  *     not recommended to tune this variable without understanding the
97  *     application environment and/or memory resources. The size of each of
98  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
99  *
100  * ibd_num_ah
101  *     Number of AH cache entries to allocate
102  *
103  * ibd_hash_size
104  *     Hash table size for the active AH list
105  *
106  * ibd_separate_cqs
107  * ibd_txcomp_poll
108  *     These boolean variables (1 or 0) may be used to tune the behavior of
109  *     ibd in managing the send and receive completion queues and in deciding
110  *     whether or not transmit completions should be polled or interrupt
111  *     driven (when the completion queues are separate). If both the completion
112  *     queues are interrupt driven, it may not be possible for the handlers to
113  *     be invoked concurrently, depending on how the interrupts are tied on
114  *     the PCI intr line.  Note that some combination of these two parameters
115  *     may not be meaningful (and therefore not allowed).
116  *
117  * ibd_tx_softintr
118  * ibd_rx_softintr
119  *     The softintr mechanism allows ibd to avoid event queue overflows if
120  *     the receive/completion handlers are to be expensive. These are enabled
121  *     by default.
122  *
123  * ibd_log_sz
124  *     This specifies the size of the ibd log buffer in bytes. The buffer is
125  *     allocated and logging is enabled only when IBD_LOGGING is defined.
126  *
127  */
/* Default values for the per-interface tunables. */
uint_t ibd_tx_copy_thresh = 0x1000;	/* Tx bcopy threshold */
uint_t ibd_num_swqe = 4000;		/* number of send WQEs */
uint_t ibd_num_rwqe = 4000;		/* number of receive WQEs */
uint_t ibd_num_lso_bufs = 0x400;	/* number of larger-than-MTU bufs */
uint_t ibd_num_ah = 64;			/* number of AH cache entries */
uint_t ibd_hash_size = 32;		/* active AH hash table size */
uint_t ibd_separate_cqs = 1;		/* boolean: separate Tx and Rx CQs */
uint_t ibd_txcomp_poll = 0;		/* boolean: poll for Tx completions */
uint_t ibd_rx_softintr = 1;		/* boolean: Rx softintr enabled */
uint_t ibd_tx_softintr = 1;		/* boolean: Tx softintr enabled */
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;		/* log buffer size, in bytes */
#endif
141 
142 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
143 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
144 #define	IBD_NUM_SWQE			ibd_num_swqe
145 #define	IBD_NUM_RWQE			ibd_num_rwqe
146 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
147 #define	IBD_NUM_AH			ibd_num_ah
148 #define	IBD_HASH_SIZE			ibd_hash_size
149 #ifdef IBD_LOGGING
150 #define	IBD_LOG_SZ			ibd_log_sz
151 #endif
152 
153 /*
154  * Receive CQ moderation parameters: NOT tunables
155  */
156 static uint_t ibd_rxcomp_count = 4;
157 static uint_t ibd_rxcomp_usec = 10;
158 
159 /*
160  * Thresholds
161  *
162  * When waiting for resources (swqes or lso buffers) to become available,
163  * the first two thresholds below determine how long to wait before informing
164  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
165  * determines how low the available swqes should go before we start polling
166  * the completion queue.
167  */
168 #define	IBD_FREE_LSOS_THRESH		8
169 #define	IBD_FREE_SWQES_THRESH		20
170 #define	IBD_TX_POLL_THRESH		80
171 
172 /*
173  * When doing multiple-send-wr or multiple-recv-wr posts, this value
174  * determines how many to do at a time (in a single ibt_post_send/recv).
175  */
176 #define	IBD_MAX_POST_MULTIPLE		4
177 
178 /*
179  * Maximum length for returning chained mps back to crossbow
180  */
181 #define	IBD_MAX_RX_MP_LEN		16
182 
183 /*
184  * LSO parameters
185  */
186 #define	IBD_LSO_MAXLEN			65536
187 #define	IBD_LSO_BUFSZ			8192
188 #define	IBD_PROP_LSO_POLICY		"lso-policy"
189 
190 /*
191  * Completion queue polling control
192  */
193 #define	IBD_RX_CQ_POLLING		0x1
194 #define	IBD_TX_CQ_POLLING		0x2
195 #define	IBD_REDO_RX_CQ_POLLING		0x4
196 #define	IBD_REDO_TX_CQ_POLLING		0x8
197 
198 /*
199  * Flag bits for resources to reap
200  */
201 #define	IBD_RSRC_SWQE			0x1
202 #define	IBD_RSRC_LSOBUF			0x2
203 
204 /*
205  * Async operation types
206  */
207 #define	IBD_ASYNC_GETAH			1
208 #define	IBD_ASYNC_JOIN			2
209 #define	IBD_ASYNC_LEAVE			3
210 #define	IBD_ASYNC_PROMON		4
211 #define	IBD_ASYNC_PROMOFF		5
212 #define	IBD_ASYNC_REAP			6
213 #define	IBD_ASYNC_TRAP			7
214 #define	IBD_ASYNC_SCHED			8
215 #define	IBD_ASYNC_LINK			9
216 #define	IBD_ASYNC_EXIT			10
217 
218 /*
219  * Async operation states
220  */
221 #define	IBD_OP_NOTSTARTED		0
222 #define	IBD_OP_ONGOING			1
223 #define	IBD_OP_COMPLETED		2
224 #define	IBD_OP_ERRORED			3
225 #define	IBD_OP_ROUTERED			4
226 
227 /*
228  * Miscellaneous constants
229  */
230 #define	IBD_SEND			0
231 #define	IBD_RECV			1
232 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
233 #ifdef IBD_LOGGING
234 #define	IBD_DMAX_LINE			100
235 #endif
236 
237 /*
238  * Enumerations for link states
239  */
/*
 * The enumerators take the implicit values 0, 1 and 2.
 * NOTE(review): the consumers of this type are outside this part of the
 * file; the exact meaning of IBD_LINK_UP_ABSENT should be confirmed there.
 */
typedef enum {
	IBD_LINK_DOWN,
	IBD_LINK_UP,
	IBD_LINK_UP_ABSENT
} ibd_link_op_t;
245 
246 /*
247  * Driver State Pointer
248  */
249 void *ibd_list;
250 
251 /*
252  * Logging
253  */
254 #ifdef IBD_LOGGING
255 kmutex_t ibd_lbuf_lock;
256 uint8_t *ibd_lbuf;
257 uint32_t ibd_lbuf_ndx;
258 #endif
259 
260 /*
261  * Required system entry points
262  */
263 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
264 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
265 
266 /*
267  * Required driver entry points for GLDv3
268  */
269 static int ibd_m_stat(void *, uint_t, uint64_t *);
270 static int ibd_m_start(void *);
271 static void ibd_m_stop(void *);
272 static int ibd_m_promisc(void *, boolean_t);
273 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
274 static int ibd_m_unicst(void *, const uint8_t *);
275 static mblk_t *ibd_m_tx(void *, mblk_t *);
276 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
277 
278 /*
279  * Private driver entry points for GLDv3
280  */
281 
282 /*
283  * Initialization
284  */
285 static int ibd_state_init(ibd_state_t *, dev_info_t *);
286 static int ibd_drv_init(ibd_state_t *);
287 static int ibd_init_txlist(ibd_state_t *);
288 static int ibd_init_rxlist(ibd_state_t *);
289 static int ibd_acache_init(ibd_state_t *);
290 #ifdef IBD_LOGGING
291 static void ibd_log_init(void);
292 #endif
293 
294 /*
295  * Termination/cleanup
296  */
297 static void ibd_state_fini(ibd_state_t *);
298 static void ibd_drv_fini(ibd_state_t *);
299 static void ibd_fini_txlist(ibd_state_t *);
300 static void ibd_fini_rxlist(ibd_state_t *);
301 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
302 static void ibd_acache_fini(ibd_state_t *);
303 #ifdef IBD_LOGGING
304 static void ibd_log_fini(void);
305 #endif
306 
307 /*
308  * Allocation/acquire/map routines
309  */
310 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
311 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
312 static int ibd_alloc_tx_copybufs(ibd_state_t *);
313 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
314 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
315 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
316     uint32_t *);
317 
318 /*
319  * Free/release/unmap routines
320  */
321 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
322 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
323 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
324 static void ibd_free_tx_copybufs(ibd_state_t *);
325 static void ibd_free_tx_lsobufs(ibd_state_t *);
326 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
327 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
328 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
329 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
330 
331 /*
332  * Handlers/callback routines
333  */
334 static uint_t ibd_intr(char *);
335 static uint_t ibd_tx_recycle(char *);
336 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
337 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
338 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
339 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
340 static void ibd_freemsg_cb(char *);
341 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
342     ibt_async_event_t *);
343 static void ibd_snet_notices_handler(void *, ib_gid_t,
344     ibt_subnet_event_code_t, ibt_subnet_event_t *);
345 
346 /*
347  * Send/receive routines
348  */
349 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
350 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
351 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
352 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
353 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
354 
355 /*
356  * Threads
357  */
358 static void ibd_async_work(ibd_state_t *);
359 
360 /*
361  * Async tasks
362  */
363 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
364 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
365 static void ibd_async_setprom(ibd_state_t *);
366 static void ibd_async_unsetprom(ibd_state_t *);
367 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
368 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
369 static void ibd_async_txsched(ibd_state_t *);
370 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
371 
372 /*
373  * Async task helpers
374  */
375 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
376 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
377 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
378 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
379     ipoib_mac_t *, ipoib_mac_t *);
380 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
381 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
382 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
383 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
384 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
385 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
386 static uint64_t ibd_get_portspeed(ibd_state_t *);
387 static int ibd_get_portpkey(ibd_state_t *, ib_guid_t *);
388 static boolean_t ibd_async_safe(ibd_state_t *);
389 static void ibd_async_done(ibd_state_t *);
390 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
391 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
392 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
393 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
394 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
395 
396 /*
397  * Miscellaneous helpers
398  */
399 static int ibd_sched_poll(ibd_state_t *, int, int);
400 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
401 static int ibd_resume_transmission(ibd_state_t *);
402 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
403 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
404 static void *list_get_head(list_t *);
405 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
406 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
407 static void ibd_print_warn(ibd_state_t *, char *, ...);
408 #ifdef IBD_LOGGING
409 static void ibd_log(const char *, ...);
410 #endif
411 
/*
 * Device operations vector, named ibd_dev_ops. ibd_attach and ibd_detach
 * are the only driver-specific routines supplied; the remaining arguments
 * are the stock nulldev/nodev/NULL placeholders, the D_MP flag and
 * ddi_quiesce_not_needed. The same vector is handed to mac_init_ops() and
 * mac_fini_ops() by _init() and _fini().
 */
DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
414 
415 /* Module Driver Info */
416 static struct modldrv ibd_modldrv = {
417 	&mod_driverops,			/* This one is a driver */
418 	"InfiniBand GLDv3 Driver",	/* short description */
419 	&ibd_dev_ops			/* driver specific ops */
420 };
421 
422 /* Module Linkage */
static struct modlinkage ibd_modlinkage = {
	/*
	 * A single linkage element (the driver description) followed by
	 * the terminating NULL. This structure is what _init(), _info()
	 * and _fini() pass to mod_install(), mod_info() and mod_remove().
	 */
	MODREV_1, (void *)&ibd_modldrv, NULL
};
426 
427 /*
428  * Module (static) info passed to IBTL during ibt_attach
429  */
/*
 * NOTE(review): the layout of ibt_clnt_modinfo_s is not visible in this
 * file; the per-field notes below are assumptions to be confirmed against
 * the IBTF headers.
 */
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
	IBTI_V_CURR,		/* presumably the IBTI interface version */
	IBT_NETWORK,		/* presumably the client class */
	ibd_async_handler,	/* async event handler, prototyped above */
	NULL,
	"IPIB"			/* presumably the client name */
};
437 
438 /*
439  * GLDv3 entry points
440  */
/* The only flag bit set is MC_GETCAPAB; ibd_m_getcapab is supplied below. */
#define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
static mac_callbacks_t ib_m_callbacks = {
	IBD_M_CALLBACK_FLAGS,
	ibd_m_stat,
	ibd_m_start,
	ibd_m_stop,
	ibd_m_promisc,
	ibd_m_multicst,
	ibd_m_unicst,
	ibd_m_tx,
	NULL,		/* NOTE(review): presumably the ioctl slot; confirm */
	ibd_m_getcapab
};
454 
/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 *
 * Both macros work in place on two 32-bit words located 4 and 8 bytes
 * past maddr. The masks and values are built in host order and converted
 * with htonl() before being applied, so the stored bytes end up in network
 * order regardless of host endianness:
 *
 *   - scope is shifted left by 16 and lands in the second byte of the
 *     word at offset 4 (the clear mask covers the low 4 bits of that byte);
 *   - pkey is shifted left by 16 and lands in the first two bytes of the
 *     word at offset 8.
 *
 * IBD_FILL_SCOPE_PKEY ORs the new values in; it does not clear the old
 * ones first, so use IBD_CLEAR_SCOPE_PKEY beforehand when the fields may
 * already be populated.
 *
 * NOTE(review): the words are accessed through a uint32_t pointer, so
 * maddr + 4 is assumed to be suitably aligned -- confirm for all callers.
 * Both macros expand to a brace-enclosed block, not a single expression.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
{							\
	*(uint32_t *)((char *)(maddr) + 4) |=		\
	    htonl((uint32_t)(scope) << 16);		\
	*(uint32_t *)((char *)(maddr) + 8) |=		\
	    htonl((uint32_t)(pkey) << 16);		\
}

#define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
{							\
	*(uint32_t *)((char *)(maddr) + 4) &=		\
	    htonl(~((uint32_t)0xF << 16));		\
	*(uint32_t *)((char *)(maddr) + 8) &=		\
	    htonl(~((uint32_t)0xFFFF << 16));		\
}
473 
474 /*
475  * Rudimentary debugging support
476  */
#ifdef DEBUG
/* Messages whose level is below ibd_debuglevel are suppressed. */
int ibd_debuglevel = 100;

/*
 * printf-style debug output, emitted through vcmn_err(CE_CONT, ...) when
 * the message level l is at least ibd_debuglevel.
 */
static void
debug_print(int l, char *fmt, ...)
{
	va_list args;

	if (l >= ibd_debuglevel) {
		va_start(args, fmt);
		vcmn_err(CE_CONT, fmt, args);
		va_end(args);
	}
}
#define	DPRINT		debug_print
#else
#define	DPRINT
#endif
494 
495 /*
496  * Common routine to print warning messages; adds in hca guid, port number
497  * and pkey to be able to identify the IBA interface.
498  */
499 static void
500 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
501 {
502 	ib_guid_t hca_guid;
503 	char ibd_print_buf[256];
504 	int len;
505 	va_list ap;
506 
507 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
508 	    0, "hca-guid", 0);
509 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
510 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
511 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
512 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
513 	va_start(ap, fmt);
514 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
515 	    fmt, ap);
516 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
517 	va_end(ap);
518 }
519 
520 /*
521  * Warlock directives
522  */
523 
524 /*
525  * id_lso_lock
526  *
527  * state->id_lso->bkt_nfree may be accessed without a lock to
528  * determine the threshold at which we have to ask the nw layer
529  * to resume transmission (see ibd_resume_transmission()).
530  */
531 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
532     ibd_state_t::id_lso))
533 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
534 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
535 
536 /*
537  * id_cq_poll_lock
538  */
539 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
540     ibd_state_t::id_cq_poll_busy))
541 
542 /*
543  * id_txpost_lock
544  */
545 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
546     ibd_state_t::id_tx_head))
547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
548     ibd_state_t::id_tx_busy))
549 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
550     ibd_state_t::id_tx_tailp))
551 
552 /*
553  * id_rxpost_lock
554  */
555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
556     ibd_state_t::id_rx_head))
557 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
558     ibd_state_t::id_rx_busy))
559 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
560     ibd_state_t::id_rx_tailp))
561 
562 /*
563  * id_acache_req_lock
564  */
565 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
566     ibd_state_t::id_acache_req_cv))
567 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
568     ibd_state_t::id_req_list))
569 
570 /*
571  * id_ac_mutex
572  *
573  * This mutex is actually supposed to protect id_ah_op as well,
574  * but this path of the code isn't clean (see update of id_ah_op
575  * in ibd_async_acache(), immediately after the call to
576  * ibd_async_mcache()). For now, we'll skip this check by
577  * declaring that id_ah_op is protected by some internal scheme
578  * that warlock isn't aware of.
579  */
580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
581     ibd_state_t::id_ah_active))
582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
583     ibd_state_t::id_ah_free))
584 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
585     ibd_state_t::id_ah_addr))
586 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
587     ibd_state_t::id_ah_op))
588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
589     ibd_state_t::id_ah_error))
590 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
591 
592 /*
593  * id_mc_mutex
594  */
595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
596     ibd_state_t::id_mc_full))
597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
598     ibd_state_t::id_mc_non))
599 
600 /*
601  * id_trap_lock
602  */
603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
604     ibd_state_t::id_trap_cv))
605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
606     ibd_state_t::id_trap_stop))
607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
608     ibd_state_t::id_trap_inprog))
609 
610 /*
611  * id_prom_op
612  */
613 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
614     ibd_state_t::id_prom_op))
615 
616 /*
617  * id_sched_lock
618  */
619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
620     ibd_state_t::id_sched_needed))
621 
622 /*
623  * id_link_mutex
624  */
625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
626     ibd_state_t::id_link_state))
627 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
628 _NOTE(SCHEME_PROTECTS_DATA("only async thr and drv init",
629     ibd_state_t::id_link_speed))
630 
631 /*
632  * id_tx_list.dl_mutex
633  */
634 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
635     ibd_state_t::id_tx_list.dl_head))
636 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
637     ibd_state_t::id_tx_list.dl_tail))
638 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
639     ibd_state_t::id_tx_list.dl_pending_sends))
640 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
641     ibd_state_t::id_tx_list.dl_cnt))
642 
643 /*
644  * id_rx_list.dl_mutex
645  */
646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
647     ibd_state_t::id_rx_list.dl_head))
648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
649     ibd_state_t::id_rx_list.dl_tail))
650 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
651     ibd_state_t::id_rx_list.dl_bufs_outstanding))
652 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
653     ibd_state_t::id_rx_list.dl_cnt))
654 
655 
656 /*
657  * Items protected by atomic updates
658  */
659 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
660     ibd_state_s::id_brd_rcv
661     ibd_state_s::id_brd_xmt
662     ibd_state_s::id_multi_rcv
663     ibd_state_s::id_multi_xmt
664     ibd_state_s::id_num_intrs
665     ibd_state_s::id_rcv_bytes
666     ibd_state_s::id_rcv_pkt
667     ibd_state_s::id_tx_short
668     ibd_state_s::id_xmt_bytes
669     ibd_state_s::id_xmt_pkt))
670 
671 /*
672  * Non-mutex protection schemes for data elements. Almost all of
673  * these are non-shared items.
674  */
675 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
676     callb_cpr
677     ib_gid_s
678     ib_header_info
679     ibd_acache_rq
680     ibd_acache_s::ac_mce
681     ibd_mcache::mc_fullreap
682     ibd_mcache::mc_jstate
683     ibd_mcache::mc_req
684     ibd_rwqe_s
685     ibd_swqe_s
686     ibd_wqe_s
687     ibt_wr_ds_s::ds_va
688     ibt_wr_lso_s
689     ipoib_mac::ipoib_qpn
690     mac_capab_lso_s
691     msgb::b_next
692     msgb::b_rptr
693     msgb::b_wptr))
694 
695 int
696 _init()
697 {
698 	int status;
699 
700 	/*
701 	 * Sanity check some parameter settings. Tx completion polling
702 	 * only makes sense with separate CQs for Tx and Rx.
703 	 */
704 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
705 		cmn_err(CE_NOTE, "!ibd: %s",
706 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
707 		ibd_txcomp_poll = 0;
708 	}
709 
710 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
711 	if (status != 0) {
712 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
713 		return (status);
714 	}
715 
716 	mac_init_ops(&ibd_dev_ops, "ibd");
717 	status = mod_install(&ibd_modlinkage);
718 	if (status != 0) {
719 		DPRINT(10, "_init:failed in mod_install()");
720 		ddi_soft_state_fini(&ibd_list);
721 		mac_fini_ops(&ibd_dev_ops);
722 		return (status);
723 	}
724 
725 #ifdef IBD_LOGGING
726 	ibd_log_init();
727 #endif
728 	return (0);
729 }
730 
731 int
732 _info(struct modinfo *modinfop)
733 {
734 	return (mod_info(&ibd_modlinkage, modinfop));
735 }
736 
737 int
738 _fini()
739 {
740 	int status;
741 
742 	status = mod_remove(&ibd_modlinkage);
743 	if (status != 0)
744 		return (status);
745 
746 	mac_fini_ops(&ibd_dev_ops);
747 	ddi_soft_state_fini(&ibd_list);
748 #ifdef IBD_LOGGING
749 	ibd_log_fini();
750 #endif
751 	return (0);
752 }
753 
754 /*
755  * Convert the GID part of the mac address from network byte order
756  * to host order.
757  */
758 static void
759 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
760 {
761 	ib_sn_prefix_t nbopref;
762 	ib_guid_t nboguid;
763 
764 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
765 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
766 	dgid->gid_prefix = b2h64(nbopref);
767 	dgid->gid_guid = b2h64(nboguid);
768 }
769 
770 /*
771  * Create the IPoIB address in network byte order from host order inputs.
772  */
773 static void
774 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
775     ib_guid_t guid)
776 {
777 	ib_sn_prefix_t nbopref;
778 	ib_guid_t nboguid;
779 
780 	mac->ipoib_qpn = htonl(qpn);
781 	nbopref = h2b64(prefix);
782 	nboguid = h2b64(guid);
783 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
784 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
785 }
786 
787 /*
788  * Send to the appropriate all-routers group when the IBA multicast group
789  * does not exist, based on whether the target group is v4 or v6.
790  */
791 static boolean_t
792 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
793     ipoib_mac_t *rmac)
794 {
795 	boolean_t retval = B_TRUE;
796 	uint32_t adjscope = state->id_scope << 16;
797 	uint32_t topword;
798 
799 	/*
800 	 * Copy the first 4 bytes in without assuming any alignment of
801 	 * input mac address; this will have IPoIB signature, flags and
802 	 * scope bits.
803 	 */
804 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
805 	topword = ntohl(topword);
806 
807 	/*
808 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
809 	 */
810 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
811 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
812 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
813 		    ((uint32_t)(state->id_pkey << 16))),
814 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
815 	else
816 		/*
817 		 * Does not have proper bits in the mgid address.
818 		 */
819 		retval = B_FALSE;
820 
821 	return (retval);
822 }
823 
824 /*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts padding by default at the end. The routine that does this is
 * nce_xmit() in ip_ndp.c; it copies the nd_lla_addr right after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]. The option header
 * is 2 bytes in size and is followed by [22 bytes of ipoib_machdr]. As a
 * result the machdr is not 4 byte aligned and has 2 bytes of padding at
 * the end.
834  * The send routine at IBD driver changes this packet as follows:
835  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
836  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
837  * aligned.
838  *
 * At the receiving side again ibd_process_rx takes the above packet and
 * removes the two bytes of front padding and inserts them at the end. This
 * is because the IP layer does not understand padding at the front.
842  */
/*
 * IBD_PAD_NSNA(ip6h, len, type)
 *
 *	ip6h	pointer to the IPv6 header; the ICMPv6 header is taken to
 *		start right behind it (&ip6h[1])
 *	len	an lvalue; it is decremented by
 *		sizeof (nd_neighbor_advert_t) as a side effect, and a
 *		non-zero remainder is taken to mean an option follows
 *	type	IBD_SEND moves the IPOIB_ADDRL address bytes that follow
 *		the option header up by two and zeroes the two bytes in
 *		front of them; any other value moves the bytes down by two
 *		and zeroes the two bytes behind them
 *
 * Nothing is moved unless the ICMPv6 type is a neighbor solicitation or
 * advertisement. The expansion is a brace-enclosed block that declares
 * its own locals, and ip6h is evaluated more than once. Every parameter
 * is parenthesized so that arguments which are not plain identifiers
 * expand as intended.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t 	*nd_lla_ptr;					\
	icmp6_t 	*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int 		i;						\
									\
	icmp6 = (icmp6_t *)&(ip6h)[1];					\
	(len) -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    ((len) != 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)(ip6h)		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if ((type) == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
871 
872 /*
873  * Address handle entries maintained by the driver are kept in the
874  * free and active lists. Each entry starts out in the free list;
875  * it migrates to the active list when primed using ibt_get_paths()
876  * and ibt_modify_ud_dest() for transmission to a specific destination.
877  * In the active list, the entry has a reference count indicating the
878  * number of ongoing/uncompleted transmits that reference it. The
879  * entry is left in the active list even after the reference count
880  * goes to 0, since successive transmits can find it there and do
881  * not need to set up another entry (ie the path information is
882  * cached using the active list). Entries on the active list are
883  * also hashed using the destination link address as a key for faster
884  * lookups during transmits.
885  *
886  * For any destination address (unicast or multicast, whatever the
887  * join states), there will be at most one entry in the active list.
888  * Entries with a 0 reference count on the active list can be reused
889  * for a transmit to a new destination, if the free list is empty.
890  *
891  * The AH free list insertion/deletion is protected with the id_ac_mutex,
892  * since the async thread and Tx callback handlers insert/delete. The
893  * active list does not need a lock (all operations are done by the
894  * async thread) but updates to the reference count are atomically
895  * done (increments done by Tx path, decrements by the Tx callback handler).
896  */
/*
 * AH cache list helpers.
 *
 *	state	pointer to the soft state holding id_ah_free, id_ah_active
 *		and id_ah_active_hash
 *	ce	pointer to the cache entry; its ac_mac member is the hash key
 *
 * The INSERT_ACTIVE and PULLOUT_ACTIVE forms keep the active list and the
 * active hash in step and expand to brace-enclosed blocks; the other forms
 * are plain expressions. Macro parameters are parenthesized so that
 * arguments other than simple identifiers expand correctly.
 *
 * mod_hash_remove() hands the removed value back through its third
 * argument, so it must be given the address of a scratch variable.
 * Passing the entry pointer itself there would make it store the value
 * over the first bytes of the entry.
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&(state)->id_ah_free, (ce))
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&(state)->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&(state)->id_ah_active, (ce));		\
	_ret_ = mod_hash_insert((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, (mod_hash_val_t)(ce)); \
	ASSERT(_ret_ == 0);					\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	mod_hash_val_t _val_;					\
	list_remove(&(state)->id_ah_active, (ce));		\
	(void) mod_hash_remove((state)->id_ah_active_hash,	\
	    (mod_hash_key_t)&(ce)->ac_mac, &_val_);		\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&(state)->id_ah_active)
915 
916 /*
917  * Membership states for different mcg's are tracked by two lists:
918  * the "non" list is used for promiscuous mode, when all mcg traffic
919  * needs to be inspected. This type of membership is never used for
920  * transmission, so there can not be an AH in the active list
921  * corresponding to a member in this list. This list does not need
922  * any protection, since all operations are performed by the async
923  * thread.
924  *
925  * "Full" and "SendOnly" membership is tracked using a single list,
926  * the "full" list. This is because this single list can then be
927  * searched during transmit to a multicast group (if an AH for the
928  * mcg is not found in the active list), since at least one type
929  * of membership must be present before initiating the transmit.
930  * This list is also emptied during driver detach, since sendonly
931  * membership acquired during transmit is dropped at detach time
932  * along with ipv4 broadcast full membership. Insert/deletes to
933  * this list are done only by the async thread, but it is also
934  * searched in program context (see multicast disable case), thus
935  * the id_mc_mutex protects the list. The driver detach path also
936  * deconstructs the "full" list, but it ensures that the async
937  * thread will not be accessing the list (by blocking out mcg
938  * trap handling and making sure no more Tx reaping will happen).
939  *
940  * Currently, an IBA attach is done in the SendOnly case too,
941  * although this is not required.
942  */
/*
 * MCE list primitives for the "full" (Full + SendOnly membership) and
 * "non" (NonMember, promiscuous) lists. Each macro is a single
 * expression; parameters are parenthesized so that any pointer or gid
 * expression may be passed.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
955 
956 /*
957  * AH and MCE active list manipulation:
958  *
959  * Multicast disable requests and MCG delete traps are two cases
960  * where the active AH entry for the mcg (if any unreferenced one exists)
961  * will be moved to the free list (to force the next Tx to the mcg to
962  * join the MCG in SendOnly mode). Port up handling will also move AHs
963  * from active to free list.
964  *
965  * In the case when some transmits are still pending on an entry
966  * for an mcg, but a multicast disable has already been issued on the
967  * mcg, there are some options to consider to preserve the join state
968  * to ensure the emitted packet is properly routed on the IBA fabric.
969  * For the AH, we can
970  * 1. take out of active list at multicast disable time.
971  * 2. take out of active list only when last pending Tx completes.
972  * For the MCE, we can
973  * 3. take out of active list at multicast disable time.
974  * 4. take out of active list only when last pending Tx completes.
975  * 5. move from active list to stale list at multicast disable time.
976  * We choose to use 2,4. We use option 4 so that if a multicast enable
977  * is tried before the pending Tx completes, the enable code finds the
978  * mce in the active list and just has to make sure it will not be reaped
979  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
980  * a stale list (#5) that would be checked in the enable code would need
981  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
982  * after the multicast disable would try to put an AH in the active list,
983  * and associate the mce it finds in the active list to this new AH,
984  * whereas the mce is already associated with the previous AH (taken off
985  * the active list), and will be removed once the pending Tx's complete
986  * (unless a reference count on mce's is implemented). One implication of
987  * using 2,4 is that new Tx's posted before the pending Tx's complete will
988  * grab new references on the AH, further delaying the leave.
989  *
990  * In the case of mcg delete (or create) trap when the port is sendonly
991  * joined, the AH and MCE handling is different: the AH and MCE has to be
992  * immediately taken off the active lists (forcing a join and path lookup
993  * at the next Tx is the only guaranteed means of ensuring a proper Tx
994  * to an mcg as it is repeatedly created and deleted and goes thru
995  * reincarnations).
996  *
997  * When a port is already sendonly joined, and a multicast enable is
998  * attempted, the same mce structure is promoted; this ensures only a
999  * single mce on the active list tracks the most powerful join state.
1000  *
1001  * In the case of port up event handling, the MCE for sendonly membership
1002  * is freed up, and the ACE is put into the free list as soon as possible
1003  * (depending on whether posted Tx's have completed). For fullmembership
1004  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1005  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1006  * done; else the mce is deconstructed (mc_fullreap case).
1007  *
1008  * MCG creation and deletion trap handling:
1009  *
1010  * These traps are unreliable (meaning sometimes the trap might never
1011  * be delivered to the subscribed nodes) and may arrive out-of-order
1012  * since they use UD transport. An alternative to relying on these
1013  * unreliable traps is to poll for mcg presence every so often, but
1014  * instead of doing that, we try to be as conservative as possible
1015  * while handling the traps, and hope that the traps do arrive at
1016  * the subscribed nodes soon. Note that if a node is fullmember
1017  * joined to an mcg, it can not possibly receive a mcg create/delete
1018  * trap for that mcg (by fullmember definition); if it does, it is
1019  * an old trap from a previous incarnation of the mcg.
1020  *
1021  * Whenever a trap is received, the driver cleans up its sendonly
1022  * membership to the group; we choose to do a sendonly leave even
1023  * on a creation trap to handle the case of a prior deletion of the mcg
1024  * having gone unnoticed. Consider an example scenario:
1025  * T1: MCG M is deleted, and fires off deletion trap D1.
1026  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1027  * T3: Node N tries to transmit to M, joining in sendonly mode.
1028  * T4: MCG M is deleted, and fires off deletion trap D2.
1029  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1030  *     If the trap is D2, then a LEAVE is not required, since the mcg
1031  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1032  *     approach is to always LEAVE, but the SM may be confused if it
1033  *     receives a LEAVE without a prior JOIN.
1034  *
1035  * Management of the non-membership to an mcg is similar to the above,
1036  * except that if the interface is in promiscuous mode, it is required
1037  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1038  * if the re-join attempt fails (in which case a warning message needs
1039  * to be printed), it is not clear whether it failed due to the mcg not
1040  * existing, or some fabric/hca issues, due to the delayed nature of
1041  * trap delivery. Querying the SA to establish presence/absence of the
1042  * mcg is also racy at best. Thus, the driver just prints a warning
1043  * message when it can not rejoin after receiving a create trap, although
1044  * this might be (on rare occasions) a mis-warning if the create trap is
1045  * received after the mcg was deleted.
1046  */
1047 
/*
 * Implementation of atomic "recycle" bit and reference count on
 * address handles, both packed into the ac_ref field. The maximum
 * reference count on any handle is limited by the number of send wqes,
 * so the count lives in the bits below CYCLEVAL and CYCLEVAL itself
 * serves as the recycle flag. Because ac_ref is updated atomically, the
 * Tx completion handler does not have to acquire id_ac_mutex to process
 * every completion, which reduces lock contention between completion
 * and the Tx path.
 *
 *	CLEAR_REFCYCLE(ace)	zero the count and the recycle bit.
 *	CYCLE_SET(ace)		true if the recycle bit is set.
 *	GET_REF(ace)		raw ac_ref value (recycle bit included).
 *	GET_REF_CYCLE(ace)	count with the recycle bit masked off;
 *				asserts that the recycle bit is set.
 *	INC_REF(ace, num)	statement macro; atomically add num refs.
 *	SET_CYCLE_IF_REF(ace)	B_TRUE if the recycle bit was already set
 *				or was set now because references are
 *				outstanding; B_FALSE if the count was 0,
 *				in which case the bit just added is
 *				cleared again.
 *	DEC_REF_DO_CYCLE(ace)	atomically drop one reference; B_TRUE if
 *				that left exactly the recycle bit (count
 *				0, recycle pending), else B_FALSE.
 *
 * Every use of the macro parameter is parenthesized so that any pointer
 * expression (not just a plain identifier) can be passed.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	((ace)->ac_ref = 0)
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (					\
	ASSERT(CYCLE_SET(ace)),					\
	((ace)->ac_ref & ~(CYCLEVAL))				\
)
#define	INC_REF(ace, num) {					\
	atomic_add_32(&(ace)->ac_ref, (num));			\
}
#define	SET_CYCLE_IF_REF(ace) (					\
	CYCLE_SET(ace) ? B_TRUE :				\
	    atomic_add_32_nv(&(ace)->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?					\
		(CLEAR_REFCYCLE(ace), B_FALSE) :		\
		B_TRUE						\
)
#define	DEC_REF_DO_CYCLE(ace) (					\
	atomic_add_32_nv(&(ace)->ac_ref, -1) ==			\
	    CYCLEVAL ?						\
		B_TRUE :					\
		B_FALSE						\
)
1096 
1097 static void *
1098 list_get_head(list_t *list)
1099 {
1100 	list_node_t *lhead = list_head(list);
1101 
1102 	if (lhead != NULL)
1103 		list_remove(list, lhead);
1104 	return (lhead);
1105 }
1106 
1107 /*
1108  * This is always guaranteed to be able to queue the work.
1109  */
1110 static void
1111 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1112 {
1113 	/* Initialize request */
1114 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1115 	ptr->rq_op = op;
1116 
1117 	/*
1118 	 * Queue provided slot onto request pool.
1119 	 */
1120 	mutex_enter(&state->id_acache_req_lock);
1121 	list_insert_tail(&state->id_req_list, ptr);
1122 
1123 	/* Go, fetch, async thread */
1124 	cv_signal(&state->id_acache_req_cv);
1125 	mutex_exit(&state->id_acache_req_lock);
1126 }
1127 
1128 /*
1129  * Main body of the per interface async thread.
1130  */
1131 static void
1132 ibd_async_work(ibd_state_t *state)
1133 {
1134 	ibd_req_t *ptr;
1135 	callb_cpr_t cprinfo;
1136 
1137 	mutex_enter(&state->id_acache_req_lock);
1138 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1139 	    callb_generic_cpr, "ibd_async_work");
1140 
1141 	for (;;) {
1142 		ptr = list_get_head(&state->id_req_list);
1143 		if (ptr != NULL) {
1144 			mutex_exit(&state->id_acache_req_lock);
1145 
1146 			/*
1147 			 * Once we have done the operation, there is no
1148 			 * guarantee the request slot is going to be valid,
1149 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1150 			 * TRAP).
1151 			 *
1152 			 * Perform the request.
1153 			 */
1154 			switch (ptr->rq_op) {
1155 				case IBD_ASYNC_GETAH:
1156 					ibd_async_acache(state, &ptr->rq_mac);
1157 					break;
1158 				case IBD_ASYNC_JOIN:
1159 				case IBD_ASYNC_LEAVE:
1160 					ibd_async_multicast(state,
1161 					    ptr->rq_gid, ptr->rq_op);
1162 					break;
1163 				case IBD_ASYNC_PROMON:
1164 					ibd_async_setprom(state);
1165 					break;
1166 				case IBD_ASYNC_PROMOFF:
1167 					ibd_async_unsetprom(state);
1168 					break;
1169 				case IBD_ASYNC_REAP:
1170 					ibd_async_reap_group(state,
1171 					    ptr->rq_ptr, ptr->rq_gid,
1172 					    IB_MC_JSTATE_FULL);
1173 					/*
1174 					 * the req buf contains in mce
1175 					 * structure, so we do not need
1176 					 * to free it here.
1177 					 */
1178 					ptr = NULL;
1179 					break;
1180 				case IBD_ASYNC_TRAP:
1181 					ibd_async_trap(state, ptr);
1182 					break;
1183 				case IBD_ASYNC_SCHED:
1184 					ibd_async_txsched(state);
1185 					break;
1186 				case IBD_ASYNC_LINK:
1187 					ibd_async_link(state, ptr);
1188 					break;
1189 				case IBD_ASYNC_EXIT:
1190 					mutex_enter(&state->id_acache_req_lock);
1191 #ifndef __lock_lint
1192 					CALLB_CPR_EXIT(&cprinfo);
1193 #else
1194 					mutex_exit(&state->id_acache_req_lock);
1195 #endif
1196 					return;
1197 			}
1198 			if (ptr != NULL)
1199 				kmem_cache_free(state->id_req_kmc, ptr);
1200 
1201 			mutex_enter(&state->id_acache_req_lock);
1202 		} else {
1203 #ifndef __lock_lint
1204 			/*
1205 			 * Nothing to do: wait till new request arrives.
1206 			 */
1207 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1208 			cv_wait(&state->id_acache_req_cv,
1209 			    &state->id_acache_req_lock);
1210 			CALLB_CPR_SAFE_END(&cprinfo,
1211 			    &state->id_acache_req_lock);
1212 #endif
1213 		}
1214 	}
1215 
1216 	/*NOTREACHED*/
1217 	_NOTE(NOT_REACHED)
1218 }
1219 
1220 /*
1221  * Return when it is safe to queue requests to the async daemon; primarily
1222  * for subnet trap and async event handling. Disallow requests before the
1223  * daemon is created, and when interface deinitilization starts.
1224  */
1225 static boolean_t
1226 ibd_async_safe(ibd_state_t *state)
1227 {
1228 	mutex_enter(&state->id_trap_lock);
1229 	if (state->id_trap_stop) {
1230 		mutex_exit(&state->id_trap_lock);
1231 		return (B_FALSE);
1232 	}
1233 	state->id_trap_inprog++;
1234 	mutex_exit(&state->id_trap_lock);
1235 	return (B_TRUE);
1236 }
1237 
1238 /*
1239  * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
1240  * trap or event handling to complete to kill the async thread and deconstruct
1241  * the mcg/ace list.
1242  */
1243 static void
1244 ibd_async_done(ibd_state_t *state)
1245 {
1246 	mutex_enter(&state->id_trap_lock);
1247 	if (--state->id_trap_inprog == 0)
1248 		cv_signal(&state->id_trap_cv);
1249 	mutex_exit(&state->id_trap_lock);
1250 }
1251 
1252 /*
1253  * Hash functions:
1254  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1255  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1256  * These operate on mac addresses input into ibd_send, but there is no
1257  * guarantee on the alignment of the ipoib_mac_t structure.
1258  */
1259 /*ARGSUSED*/
1260 static uint_t
1261 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1262 {
1263 	ulong_t ptraddr = (ulong_t)key;
1264 	uint_t hval;
1265 
1266 	/*
1267 	 * If the input address is 4 byte aligned, we can just dereference
1268 	 * it. This is most common, since IP will send in a 4 byte aligned
1269 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1270 	 * 4 byte aligned too.
1271 	 */
1272 	if ((ptraddr & 3) == 0)
1273 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1274 
1275 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1276 	return (hval);
1277 }
1278 
1279 static int
1280 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1281 {
1282 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1283 		return (0);
1284 	else
1285 		return (1);
1286 }
1287 
1288 /*
1289  * Initialize all the per interface caches and lists; AH cache,
1290  * MCG list etc.
1291  */
1292 static int
1293 ibd_acache_init(ibd_state_t *state)
1294 {
1295 	ibd_ace_t *ce;
1296 	int i;
1297 
1298 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1299 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1300 
1301 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1302 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1303 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1304 	    offsetof(ibd_ace_t, ac_list));
1305 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1306 	    offsetof(ibd_ace_t, ac_list));
1307 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1308 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1309 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1310 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1311 	    offsetof(ibd_mce_t, mc_list));
1312 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1313 	    offsetof(ibd_mce_t, mc_list));
1314 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1315 	    offsetof(ibd_req_t, rq_list));
1316 
1317 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1318 	    IBD_NUM_AH, KM_SLEEP);
1319 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1320 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1321 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1322 			ibd_acache_fini(state);
1323 			return (DDI_FAILURE);
1324 		} else {
1325 			CLEAR_REFCYCLE(ce);
1326 			ce->ac_mce = NULL;
1327 			IBD_ACACHE_INSERT_FREE(state, ce);
1328 		}
1329 	}
1330 	return (DDI_SUCCESS);
1331 }
1332 
1333 static void
1334 ibd_acache_fini(ibd_state_t *state)
1335 {
1336 	ibd_ace_t *ptr;
1337 
1338 	mutex_enter(&state->id_ac_mutex);
1339 
1340 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1341 		ASSERT(GET_REF(ptr) == 0);
1342 		(void) ibt_free_ud_dest(ptr->ac_dest);
1343 	}
1344 
1345 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1346 		ASSERT(GET_REF(ptr) == 0);
1347 		(void) ibt_free_ud_dest(ptr->ac_dest);
1348 	}
1349 
1350 	list_destroy(&state->id_ah_free);
1351 	list_destroy(&state->id_ah_active);
1352 	list_destroy(&state->id_mc_full);
1353 	list_destroy(&state->id_mc_non);
1354 	list_destroy(&state->id_req_list);
1355 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1356 	mutex_exit(&state->id_ac_mutex);
1357 	mutex_destroy(&state->id_ac_mutex);
1358 	mutex_destroy(&state->id_mc_mutex);
1359 	mutex_destroy(&state->id_acache_req_lock);
1360 	cv_destroy(&state->id_acache_req_cv);
1361 }
1362 
1363 /*
1364  * Search AH active hash list for a cached path to input destination.
1365  * If we are "just looking", hold == F. When we are in the Tx path,
1366  * we set hold == T to grab a reference on the AH so that it can not
1367  * be recycled to a new destination while the Tx request is posted.
1368  */
1369 static ibd_ace_t *
1370 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1371 {
1372 	ibd_ace_t *ptr;
1373 
1374 	ASSERT(mutex_owned(&state->id_ac_mutex));
1375 
1376 	/*
1377 	 * Do hash search.
1378 	 */
1379 	if (mod_hash_find(state->id_ah_active_hash,
1380 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1381 		if (hold)
1382 			INC_REF(ptr, num);
1383 		return (ptr);
1384 	}
1385 	return (NULL);
1386 }
1387 
1388 /*
1389  * This is called by the tx side; if an initialized AH is found in
1390  * the active list, it is locked down and can be used; if no entry
1391  * is found, an async request is queued to do path resolution.
1392  */
1393 static ibd_ace_t *
1394 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1395 {
1396 	ibd_ace_t *ptr;
1397 	ibd_req_t *req;
1398 
1399 	/*
1400 	 * Only attempt to print when we can; in the mdt pattr case, the
1401 	 * address is not aligned properly.
1402 	 */
1403 	if (((ulong_t)mac & 3) == 0) {
1404 		DPRINT(4,
1405 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1406 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1407 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1408 		    htonl(mac->ipoib_gidsuff[1]));
1409 	}
1410 
1411 	mutex_enter(&state->id_ac_mutex);
1412 
1413 	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
1414 		mutex_exit(&state->id_ac_mutex);
1415 		return (ptr);
1416 	}
1417 
1418 	/*
1419 	 * Implementation of a single outstanding async request; if
1420 	 * the operation is not started yet, queue a request and move
1421 	 * to ongoing state. Remember in id_ah_addr for which address
1422 	 * we are queueing the request, in case we need to flag an error;
1423 	 * Any further requests, for the same or different address, until
1424 	 * the operation completes, is sent back to GLDv3 to be retried.
1425 	 * The async thread will update id_ah_op with an error indication
1426 	 * or will set it to indicate the next look up can start; either
1427 	 * way, it will mac_tx_update() so that all blocked requests come
1428 	 * back here.
1429 	 */
1430 	*err = EAGAIN;
1431 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1432 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1433 		if (req != NULL) {
1434 			/*
1435 			 * We did not even find the entry; queue a request
1436 			 * for it.
1437 			 */
1438 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1439 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1440 			state->id_ah_op = IBD_OP_ONGOING;
1441 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1442 		}
1443 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1444 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1445 		/*
1446 		 * Check the status of the pathrecord lookup request
1447 		 * we had queued before.
1448 		 */
1449 		if (state->id_ah_op == IBD_OP_ERRORED) {
1450 			*err = EFAULT;
1451 			state->id_ah_error++;
1452 		} else {
1453 			/*
1454 			 * IBD_OP_ROUTERED case: We need to send to the
1455 			 * all-router MCG. If we can find the AH for
1456 			 * the mcg, the Tx will be attempted. If we
1457 			 * do not find the AH, we return NORESOURCES
1458 			 * to retry.
1459 			 */
1460 			ipoib_mac_t routermac;
1461 
1462 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1463 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1464 			    numwqe);
1465 		}
1466 		state->id_ah_op = IBD_OP_NOTSTARTED;
1467 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1468 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1469 		/*
1470 		 * This case can happen when we get a higher band
1471 		 * packet. The easiest way is to reset the state machine
1472 		 * to accommodate the higher priority packet.
1473 		 */
1474 		state->id_ah_op = IBD_OP_NOTSTARTED;
1475 	}
1476 	mutex_exit(&state->id_ac_mutex);
1477 
1478 	return (ptr);
1479 }
1480 
1481 /*
1482  * Grab a not-currently-in-use AH/PathRecord from the active
1483  * list to recycle to a new destination. Only the async thread
1484  * executes this code.
1485  */
1486 static ibd_ace_t *
1487 ibd_acache_get_unref(ibd_state_t *state)
1488 {
1489 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1490 
1491 	ASSERT(mutex_owned(&state->id_ac_mutex));
1492 
1493 	/*
1494 	 * Do plain linear search.
1495 	 */
1496 	while (ptr != NULL) {
1497 		/*
1498 		 * Note that it is possible that the "cycle" bit
1499 		 * is set on the AH w/o any reference count. The
1500 		 * mcg must have been deleted, and the tx cleanup
1501 		 * just decremented the reference count to 0, but
1502 		 * hasn't gotten around to grabbing the id_ac_mutex
1503 		 * to move the AH into the free list.
1504 		 */
1505 		if (GET_REF(ptr) == 0) {
1506 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1507 			break;
1508 		}
1509 		ptr = list_next(&state->id_ah_active, ptr);
1510 	}
1511 	return (ptr);
1512 }
1513 
1514 /*
1515  * Invoked to clean up AH from active list in case of multicast
1516  * disable and to handle sendonly memberships during mcg traps.
1517  * And for port up processing for multicast and unicast AHs.
1518  * Normally, the AH is taken off the active list, and put into
1519  * the free list to be recycled for a new destination. In case
1520  * Tx requests on the AH have not completed yet, the AH is marked
1521  * for reaping (which will put the AH on the free list) once the Tx's
1522  * complete; in this case, depending on the "force" input, we take
1523  * out the AH from the active list right now, or leave it also for
1524  * the reap operation. Returns TRUE if the AH is taken off the active
1525  * list (and either put into the free list right now, or arranged for
1526  * later), FALSE otherwise.
1527  */
1528 static boolean_t
1529 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1530 {
1531 	ibd_ace_t *acactive;
1532 	boolean_t ret = B_TRUE;
1533 
1534 	ASSERT(mutex_owned(&state->id_ac_mutex));
1535 
1536 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1537 
1538 		/*
1539 		 * Note that the AH might already have the cycle bit set
1540 		 * on it; this might happen if sequences of multicast
1541 		 * enables and disables are coming so fast, that posted
1542 		 * Tx's to the mcg have not completed yet, and the cycle
1543 		 * bit is set successively by each multicast disable.
1544 		 */
1545 		if (SET_CYCLE_IF_REF(acactive)) {
1546 			if (!force) {
1547 				/*
1548 				 * The ace is kept on the active list, further
1549 				 * Tx's can still grab a reference on it; the
1550 				 * ace is reaped when all pending Tx's
1551 				 * referencing the AH complete.
1552 				 */
1553 				ret = B_FALSE;
1554 			} else {
1555 				/*
1556 				 * In the mcg trap case, we always pull the
1557 				 * AH from the active list. And also the port
1558 				 * up multi/unicast case.
1559 				 */
1560 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1561 				acactive->ac_mce = NULL;
1562 			}
1563 		} else {
1564 			/*
1565 			 * Determined the ref count is 0, thus reclaim
1566 			 * immediately after pulling out the ace from
1567 			 * the active list.
1568 			 */
1569 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1570 			acactive->ac_mce = NULL;
1571 			IBD_ACACHE_INSERT_FREE(state, acactive);
1572 		}
1573 
1574 	}
1575 	return (ret);
1576 }
1577 
1578 /*
1579  * Helper function for async path record lookup. If we are trying to
1580  * Tx to a MCG, check our membership, possibly trying to join the
1581  * group if required. If that fails, try to send the packet to the
1582  * all router group (indicated by the redirect output), pointing
1583  * the input mac address to the router mcg address.
1584  */
1585 static ibd_mce_t *
1586 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1587 {
1588 	ib_gid_t mgid;
1589 	ibd_mce_t *mce;
1590 	ipoib_mac_t routermac;
1591 
1592 	*redirect = B_FALSE;
1593 	ibd_n2h_gid(mac, &mgid);
1594 
1595 	/*
1596 	 * Check the FullMember+SendOnlyNonMember list.
1597 	 * Since we are the only one who manipulates the
1598 	 * id_mc_full list, no locks are needed.
1599 	 */
1600 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1601 	if (mce != NULL) {
1602 		DPRINT(4, "ibd_async_mcache : already joined to group");
1603 		return (mce);
1604 	}
1605 
1606 	/*
1607 	 * Not found; try to join(SendOnlyNonMember) and attach.
1608 	 */
1609 	DPRINT(4, "ibd_async_mcache : not joined to group");
1610 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1611 	    NULL) {
1612 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1613 		return (mce);
1614 	}
1615 
1616 	/*
1617 	 * MCGroup not present; try to join the all-router group. If
1618 	 * any of the following steps succeed, we will be redirecting
1619 	 * to the all router group.
1620 	 */
1621 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1622 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1623 		return (NULL);
1624 	*redirect = B_TRUE;
1625 	ibd_n2h_gid(&routermac, &mgid);
1626 	bcopy(&routermac, mac, IPOIB_ADDRL);
1627 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1628 	    mgid.gid_prefix, mgid.gid_guid);
1629 
1630 	/*
1631 	 * Are we already joined to the router group?
1632 	 */
1633 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1634 		DPRINT(4, "ibd_async_mcache : using already joined router"
1635 		    "group\n");
1636 		return (mce);
1637 	}
1638 
1639 	/*
1640 	 * Can we join(SendOnlyNonMember) the router group?
1641 	 */
1642 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1643 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1644 	    NULL) {
1645 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1646 		return (mce);
1647 	}
1648 
1649 	return (NULL);
1650 }
1651 
1652 /*
1653  * Async path record lookup code.
1654  */
1655 static void
1656 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1657 {
1658 	ibd_ace_t *ce;
1659 	ibd_mce_t *mce = NULL;
1660 	ibt_path_attr_t path_attr;
1661 	ibt_path_info_t path_info;
1662 	ib_gid_t destgid;
1663 	int ret = IBD_OP_NOTSTARTED;
1664 
1665 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1666 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1667 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1668 	    htonl(mac->ipoib_gidsuff[1]));
1669 
1670 	/*
1671 	 * Check whether we are trying to transmit to a MCG.
1672 	 * In that case, we need to make sure we are a member of
1673 	 * the MCG.
1674 	 */
1675 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1676 		boolean_t redirected;
1677 
1678 		/*
1679 		 * If we can not find or join the group or even
1680 		 * redirect, error out.
1681 		 */
1682 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1683 		    NULL) {
1684 			state->id_ah_op = IBD_OP_ERRORED;
1685 			return;
1686 		}
1687 
1688 		/*
1689 		 * If we got redirected, we need to determine whether
1690 		 * the AH for the new mcg is in the cache already, and
1691 		 * not pull it in then; otherwise proceed to get the
1692 		 * path for the new mcg. There is no guarantee that
1693 		 * if the AH is currently in the cache, it will still be
1694 		 * there when we look in ibd_acache_lookup(), but that's
1695 		 * okay, we will come back here.
1696 		 */
1697 		if (redirected) {
1698 			ret = IBD_OP_ROUTERED;
1699 			DPRINT(4, "ibd_async_acache :  redirected to "
1700 			    "%08X:%08X:%08X:%08X:%08X",
1701 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1702 			    htonl(mac->ipoib_gidpref[1]),
1703 			    htonl(mac->ipoib_gidsuff[0]),
1704 			    htonl(mac->ipoib_gidsuff[1]));
1705 
1706 			mutex_enter(&state->id_ac_mutex);
1707 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1708 				state->id_ah_op = IBD_OP_ROUTERED;
1709 				mutex_exit(&state->id_ac_mutex);
1710 				DPRINT(4, "ibd_async_acache : router AH found");
1711 				return;
1712 			}
1713 			mutex_exit(&state->id_ac_mutex);
1714 		}
1715 	}
1716 
1717 	/*
1718 	 * Get an AH from the free list.
1719 	 */
1720 	mutex_enter(&state->id_ac_mutex);
1721 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1722 		/*
1723 		 * No free ones; try to grab an unreferenced active
1724 		 * one. Maybe we need to make the active list LRU,
1725 		 * but that will create more work for Tx callbacks.
1726 		 * Is there a way of not having to pull out the
1727 		 * entry from the active list, but just indicate it
1728 		 * is being recycled? Yes, but that creates one more
1729 		 * check in the fast lookup path.
1730 		 */
1731 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1732 			/*
1733 			 * Pretty serious shortage now.
1734 			 */
1735 			state->id_ah_op = IBD_OP_NOTSTARTED;
1736 			mutex_exit(&state->id_ac_mutex);
1737 			DPRINT(10, "ibd_async_acache : failed to find AH "
1738 			    "slot\n");
1739 			return;
1740 		}
1741 		/*
1742 		 * We could check whether ac_mce points to a SendOnly
1743 		 * member and drop that membership now. Or do it lazily
1744 		 * at detach time.
1745 		 */
1746 		ce->ac_mce = NULL;
1747 	}
1748 	mutex_exit(&state->id_ac_mutex);
1749 	ASSERT(ce->ac_mce == NULL);
1750 
1751 	/*
1752 	 * Update the entry.
1753 	 */
1754 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1755 
1756 	bzero(&path_info, sizeof (path_info));
1757 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1758 	path_attr.pa_sgid = state->id_sgid;
1759 	path_attr.pa_num_dgids = 1;
1760 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1761 	path_attr.pa_dgids = &destgid;
1762 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1763 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1764 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1765 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1766 		goto error;
1767 	}
1768 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1769 	    ntohl(ce->ac_mac.ipoib_qpn),
1770 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1771 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1772 		goto error;
1773 	}
1774 
1775 	/*
1776 	 * mce is set whenever an AH is being associated with a
1777 	 * MCG; this will come in handy when we leave the MCG. The
1778 	 * lock protects Tx fastpath from scanning the active list.
1779 	 */
1780 	if (mce != NULL)
1781 		ce->ac_mce = mce;
1782 	mutex_enter(&state->id_ac_mutex);
1783 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1784 	state->id_ah_op = ret;
1785 	mutex_exit(&state->id_ac_mutex);
1786 	return;
1787 error:
1788 	/*
1789 	 * We might want to drop SendOnly membership here if we
1790 	 * joined above. The lock protects Tx callbacks inserting
1791 	 * into the free list.
1792 	 */
1793 	mutex_enter(&state->id_ac_mutex);
1794 	state->id_ah_op = IBD_OP_ERRORED;
1795 	IBD_ACACHE_INSERT_FREE(state, ce);
1796 	mutex_exit(&state->id_ac_mutex);
1797 }
1798 
1799 /*
1800  * While restoring port's presence on the subnet on a port up, it is possible
1801  * that the port goes down again.
1802  */
1803 static void
1804 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1805 {
1806 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1807 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1808 	    LINK_STATE_UP;
1809 	ibd_mce_t *mce, *pmce;
1810 	ibd_ace_t *ace, *pace;
1811 
1812 	DPRINT(10, "ibd_async_link(): %d", opcode);
1813 
1814 	/*
1815 	 * On a link up, revalidate the link speed/width. No point doing
1816 	 * this on a link down, since we will be unable to do SA operations,
1817 	 * defaulting to the lowest speed. Also notice that we update our
1818 	 * notion of speed before calling mac_link_update(), which will do
1819 	 * neccesary higher level notifications for speed changes.
1820 	 */
1821 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1822 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1823 		state->id_link_speed = ibd_get_portspeed(state);
1824 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1825 	}
1826 
1827 	/*
1828 	 * Do all the work required to establish our presence on
1829 	 * the subnet.
1830 	 */
1831 	if (opcode == IBD_LINK_UP_ABSENT) {
1832 		/*
1833 		 * If in promiscuous mode ...
1834 		 */
1835 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1836 			/*
1837 			 * Drop all nonmembership.
1838 			 */
1839 			ibd_async_unsetprom(state);
1840 
1841 			/*
1842 			 * Then, try to regain nonmembership to all mcg's.
1843 			 */
1844 			ibd_async_setprom(state);
1845 
1846 		}
1847 
1848 		/*
1849 		 * Drop all sendonly membership (which also gets rid of the
1850 		 * AHs); try to reacquire all full membership.
1851 		 */
1852 		mce = list_head(&state->id_mc_full);
1853 		while ((pmce = mce) != NULL) {
1854 			mce = list_next(&state->id_mc_full, mce);
1855 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1856 				ibd_leave_group(state,
1857 				    pmce->mc_info.mc_adds_vect.av_dgid,
1858 				    IB_MC_JSTATE_SEND_ONLY_NON);
1859 			else
1860 				ibd_reacquire_group(state, pmce);
1861 		}
1862 
1863 		/*
1864 		 * Recycle all active AHs to free list (and if there are
1865 		 * pending posts, make sure they will go into the free list
1866 		 * once the Tx's complete). Grab the lock to prevent
1867 		 * concurrent Tx's as well as Tx cleanups.
1868 		 */
1869 		mutex_enter(&state->id_ac_mutex);
1870 		ace = list_head(&state->id_ah_active);
1871 		while ((pace = ace) != NULL) {
1872 			boolean_t cycled;
1873 
1874 			ace = list_next(&state->id_ah_active, ace);
1875 			mce = pace->ac_mce;
1876 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1877 			    B_TRUE);
1878 			/*
1879 			 * If this is for an mcg, it must be for a fullmember,
1880 			 * since we got rid of send-only members above when
1881 			 * processing the mce list.
1882 			 */
1883 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1884 			    IB_MC_JSTATE_FULL)));
1885 
1886 			/*
1887 			 * Check if the fullmember mce needs to be torn down,
1888 			 * ie whether the DLPI disable has already been done.
1889 			 * If so, do some of the work of tx_cleanup, namely
1890 			 * causing leave (which will fail), detach and
1891 			 * mce-freeing. tx_cleanup will put the AH into free
1892 			 * list. The reason to duplicate some of this
1893 			 * tx_cleanup work is because we want to delete the
1894 			 * AH right now instead of waiting for tx_cleanup, to
1895 			 * force subsequent Tx's to reacquire an AH.
1896 			 */
1897 			if ((mce != NULL) && (mce->mc_fullreap))
1898 				ibd_async_reap_group(state, mce,
1899 				    mce->mc_info.mc_adds_vect.av_dgid,
1900 				    mce->mc_jstate);
1901 		}
1902 		mutex_exit(&state->id_ac_mutex);
1903 	}
1904 
1905 	/*
1906 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1907 	 * (which stops further events from being delivered) before
1908 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1909 	 * has already been done.
1910 	 */
1911 	mutex_enter(&state->id_link_mutex);
1912 	state->id_link_state = lstate;
1913 	mac_link_update(state->id_mh, lstate);
1914 	mutex_exit(&state->id_link_mutex);
1915 
1916 	ibd_async_done(state);
1917 }
1918 
1919 /*
1920  * When the link is notified up, we need to do a few things, based
1921  * on the port's current p_init_type_reply claiming a reinit has been
1922  * done or not. The reinit steps are:
1923  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1924  *    the old Pkey and GID0 are correct.
1925  * 2. Register for mcg traps (already done by ibmf).
1926  * 3. If PreservePresenceReply indicates the SM has restored port's presence
1927  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
1928  * 4. Give up all sendonly memberships.
1929  * 5. Acquire all full memberships.
1930  * 6. In promiscuous mode, acquire all non memberships.
1931  * 7. Recycle all AHs to free list.
1932  */
1933 static void
1934 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1935 {
1936 	ibt_hca_portinfo_t *port_infop;
1937 	ibt_status_t ibt_status;
1938 	uint_t psize, port_infosz;
1939 	ibd_link_op_t opcode;
1940 	ibd_req_t *req;
1941 
1942 	/*
1943 	 * Do not send a request to the async daemon if it has not
1944 	 * yet been created or is being destroyed. If the async
1945 	 * daemon has not yet been created, we still need to track
1946 	 * last known state of the link. If this code races with the
1947 	 * detach path, then we are assured that the detach path has
1948 	 * not yet done the ibt_close_hca (which waits for all async
1949 	 * events to complete). If the code races with the attach path,
1950 	 * we need to validate the pkey/gid (in the link_up case) if
1951 	 * the initialization path has already set these up and created
1952 	 * IBTF resources based on the values.
1953 	 */
1954 	mutex_enter(&state->id_link_mutex);
1955 
1956 	/*
1957 	 * If the init code in ibd_drv_init hasn't yet set up the
1958 	 * pkey/gid, nothing to do; that code will set the link state.
1959 	 */
1960 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
1961 		mutex_exit(&state->id_link_mutex);
1962 		return;
1963 	}
1964 
1965 	if ((code == IBT_EVENT_PORT_UP) || (code == IBT_CLNT_REREG_EVENT) ||
1966 	    (code == IBT_PORT_CHANGE_EVENT)) {
1967 		uint8_t itreply;
1968 		boolean_t badup = B_FALSE;
1969 
1970 		ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
1971 		    state->id_port, &port_infop, &psize, &port_infosz);
1972 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
1973 			mutex_exit(&state->id_link_mutex);
1974 			DPRINT(10, "ibd_link_up : failed in"
1975 			    " ibt_query_port()\n");
1976 			return;
1977 		}
1978 
1979 		/*
1980 		 * If the link already went down by the time the handler gets
1981 		 * here, give up; we can not even validate pkey/gid since those
1982 		 * are not valid.
1983 		 */
1984 		if (port_infop->p_linkstate != IBT_PORT_ACTIVE)
1985 			badup = B_TRUE;
1986 
1987 		itreply = port_infop->p_init_type_reply;
1988 
1989 		/*
1990 		 * In InitTypeReply, check if NoLoadReply ==
1991 		 * PreserveContentReply == 0, in which case, verify Pkey/GID0.
1992 		 */
1993 		if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
1994 		    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
1995 		    (!badup)) {
1996 			/*
1997 			 * Check that the subnet part of GID0 has not changed.
1998 			 */
1999 			if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid,
2000 			    sizeof (ib_gid_t)) != 0)
2001 				badup = B_TRUE;
2002 
2003 			/*
2004 			 * Check that Pkey/index mapping is still valid.
2005 			 */
2006 			if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) ||
2007 			    (port_infop->p_pkey_tbl[state->id_pkix] !=
2008 			    state->id_pkey))
2009 				badup = B_TRUE;
2010 		}
2011 
2012 		/*
2013 		 * In InitTypeReply, if PreservePresenceReply indicates the SM
2014 		 * has ensured that the port's presence in mcg, traps etc is
2015 		 * intact, nothing more to do.
2016 		 */
2017 		opcode = IBD_LINK_UP_ABSENT;
2018 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2019 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
2020 			opcode = IBD_LINK_UP;
2021 
2022 		ibt_free_portinfo(port_infop, port_infosz);
2023 
2024 		if (badup) {
2025 			code = IBT_ERROR_PORT_DOWN;
2026 		} else if (code == IBT_PORT_CHANGE_EVENT) {
2027 			mutex_exit(&state->id_link_mutex);
2028 			return;
2029 		}
2030 	}
2031 
2032 	if (!ibd_async_safe(state)) {
2033 		state->id_link_state = (((code == IBT_EVENT_PORT_UP) ||
2034 		    (code == IBT_CLNT_REREG_EVENT)) ?  LINK_STATE_UP :
2035 		    LINK_STATE_DOWN);
2036 		mutex_exit(&state->id_link_mutex);
2037 		return;
2038 	}
2039 	mutex_exit(&state->id_link_mutex);
2040 
2041 	if (code == IBT_ERROR_PORT_DOWN)
2042 		opcode = IBD_LINK_DOWN;
2043 
2044 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2045 	req->rq_ptr = (void *)opcode;
2046 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2047 }
2048 
2049 /*
2050  * For the port up/down events, IBTL guarantees there will not be concurrent
2051  * invocations of the handler. IBTL might coalesce link transition events,
2052  * and not invoke the handler for _each_ up/down transition, but it will
2053  * invoke the handler with last known state
2054  */
2055 static void
2056 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2057     ibt_async_code_t code, ibt_async_event_t *event)
2058 {
2059 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2060 
2061 	switch (code) {
2062 	case IBT_ERROR_CATASTROPHIC_CHAN:
2063 		ibd_print_warn(state, "catastrophic channel error");
2064 		break;
2065 	case IBT_ERROR_CQ:
2066 		ibd_print_warn(state, "completion queue error");
2067 		break;
2068 	case IBT_PORT_CHANGE_EVENT:
2069 		/*
2070 		 * Events will be delivered to all instances that have
2071 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2072 		 * Only need to do work for our port; IBTF will deliver
2073 		 * events for other ports on the hca we have ibt_open_hca'ed
2074 		 * too. Note that ibd_drv_init() initializes id_port before
2075 		 * doing ibt_open_hca().
2076 		 */
2077 		ASSERT(state->id_hca_hdl == hca_hdl);
2078 		if (state->id_port != event->ev_port)
2079 			break;
2080 
2081 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2082 		    IBT_PORT_CHANGE_PKEY) {
2083 			ibd_link_mod(state, code);
2084 		}
2085 		break;
2086 	case IBT_ERROR_PORT_DOWN:
2087 	case IBT_CLNT_REREG_EVENT:
2088 	case IBT_EVENT_PORT_UP:
2089 		/*
2090 		 * Events will be delivered to all instances that have
2091 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2092 		 * Only need to do work for our port; IBTF will deliver
2093 		 * events for other ports on the hca we have ibt_open_hca'ed
2094 		 * too. Note that ibd_drv_init() initializes id_port before
2095 		 * doing ibt_open_hca().
2096 		 */
2097 		ASSERT(state->id_hca_hdl == hca_hdl);
2098 		if (state->id_port != event->ev_port)
2099 			break;
2100 
2101 		ibd_link_mod(state, code);
2102 		break;
2103 
2104 	case IBT_HCA_ATTACH_EVENT:
2105 	case IBT_HCA_DETACH_EVENT:
2106 		/*
2107 		 * When a new card is plugged to the system, attach_event is
2108 		 * invoked. Additionally, a cfgadm needs to be run to make the
2109 		 * card known to the system, and an ifconfig needs to be run to
2110 		 * plumb up any ibd interfaces on the card. In the case of card
2111 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2112 		 * unplumb the ibd interfaces on the card; when the card is
2113 		 * actually unplugged, the detach_event is invoked;
2114 		 * additionally, if any ibd instances are still active on the
2115 		 * card (eg there were no associated RCM scripts), driver's
2116 		 * detach routine is invoked.
2117 		 */
2118 		break;
2119 	default:
2120 		break;
2121 	}
2122 }
2123 
2124 /*
2125  * Attach device to the IO framework.
2126  */
2127 static int
2128 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2129 {
2130 	mac_register_t *macp;
2131 	ibd_state_t *state;
2132 	int instance;
2133 	int err;
2134 
2135 	switch (cmd) {
2136 		case DDI_ATTACH:
2137 			break;
2138 		case DDI_RESUME:
2139 			/* This driver does not support resume */
2140 		default:
2141 			return (DDI_FAILURE);
2142 	}
2143 
2144 	/*
2145 	 * Allocate soft device data structure
2146 	 */
2147 	instance = ddi_get_instance(dip);
2148 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2149 		return (DDI_FAILURE);
2150 	state = ddi_get_soft_state(ibd_list, instance);
2151 
2152 	/* pre ibt_attach() soft state initialization */
2153 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2154 		DPRINT(10, "ibd_attach : failed in ibd_state_init()");
2155 		goto attach_fail_state_init;
2156 	}
2157 
2158 	/* alloc rx soft intr */
2159 	if ((ibd_rx_softintr == 1) &&
2160 	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2161 	    NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) {
2162 		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
2163 		goto attach_fail_ddi_add_rx_softintr;
2164 	}
2165 
2166 	/* alloc tx soft intr */
2167 	if ((ibd_tx_softintr == 1) &&
2168 	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2169 	    NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) {
2170 		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
2171 		goto attach_fail_ddi_add_tx_softintr;
2172 	}
2173 
2174 	/* "attach" to IBTL */
2175 	if (ibt_attach(&ibd_clnt_modinfo, dip, state,
2176 	    &state->id_ibt_hdl) != IBT_SUCCESS) {
2177 		DPRINT(10, "ibd_attach : failed in ibt_attach()");
2178 		goto attach_fail_ibt_attach;
2179 	}
2180 
2181 	/* Finish initializing this driver */
2182 	if (ibd_drv_init(state) != DDI_SUCCESS) {
2183 		DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n");
2184 		goto attach_fail_drv_init;
2185 	}
2186 
2187 	/*
2188 	 * Initialize pointers to device specific functions which will be
2189 	 * used by the generic layer.
2190 	 */
2191 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2192 		DPRINT(10, "ibd_attach : failed in mac_alloc()");
2193 		goto attach_fail_drv_init;
2194 	}
2195 
2196 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2197 	macp->m_driver = state;
2198 	macp->m_dip = state->id_dip;
2199 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2200 	macp->m_callbacks = &ib_m_callbacks;
2201 	macp->m_min_sdu = 0;
2202 	macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE;
2203 
2204 	/*
2205 	 *  Register ourselves with the GLDv3 interface
2206 	 */
2207 	err = mac_register(macp, &state->id_mh);
2208 	mac_free(macp);
2209 	if (err != 0) {
2210 		DPRINT(10, "ibd_attach : failed in mac_register()");
2211 		goto attach_fail_mac_register;
2212 	}
2213 
2214 	/*
2215 	 * Setup the handler we will use for regular DLPI stuff. Its important
2216 	 * to setup the recv handler after registering with gldv3.
2217 	 */
2218 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2219 	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
2220 	    IBT_SUCCESS) {
2221 		DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n");
2222 		goto attach_fail_setup_handler;
2223 	}
2224 
2225 	/*
2226 	 * Setup the subnet notices handler after we initialize the a/mcaches
2227 	 * and start the async thread, both of which are required for the
2228 	 * trap handler to function properly. Enable the trap handler to
2229 	 * queue requests to the async thread after the mac_register, because
2230 	 * the async daemon invokes mac_tx_update(), which must be done after
2231 	 * mac_register().
2232 	 */
2233 	ibt_register_subnet_notices(state->id_ibt_hdl,
2234 	    ibd_snet_notices_handler, state);
2235 	mutex_enter(&state->id_trap_lock);
2236 	state->id_trap_stop = B_FALSE;
2237 	mutex_exit(&state->id_trap_lock);
2238 
2239 	/*
2240 	 * Indicate link status to GLDv3 and higher layers. By default,
2241 	 * we assume we are in up state (which must have been true at
2242 	 * least at the time the broadcast mcg's were probed); if there
2243 	 * were any up/down transitions till the time we come here, the
2244 	 * async handler will have updated last known state, which we
2245 	 * use to tell GLDv3. The async handler will not send any
2246 	 * notifications to GLDv3 till we reach here in the initialization
2247 	 * sequence.
2248 	 */
2249 	mac_link_update(state->id_mh, state->id_link_state);
2250 
2251 	return (DDI_SUCCESS);
2252 
2253 	/* Attach failure points, cleanup */
2254 attach_fail_setup_handler:
2255 	(void) mac_unregister(state->id_mh);
2256 
2257 attach_fail_mac_register:
2258 	ibd_drv_fini(state);
2259 
2260 attach_fail_drv_init:
2261 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2262 		ibd_print_warn(state, "failed to free IB resources");
2263 
2264 attach_fail_ibt_attach:
2265 	if (ibd_tx_softintr == 1)
2266 		ddi_remove_softintr(state->id_tx);
2267 
2268 attach_fail_ddi_add_tx_softintr:
2269 	if (ibd_rx_softintr == 1)
2270 		ddi_remove_softintr(state->id_rx);
2271 
2272 attach_fail_ddi_add_rx_softintr:
2273 	ibd_state_fini(state);
2274 
2275 attach_fail_state_init:
2276 	ddi_soft_state_free(ibd_list, instance);
2277 
2278 	return (DDI_FAILURE);
2279 }
2280 
2281 /*
2282  * Detach device from the IO framework.
2283  */
2284 static int
2285 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2286 {
2287 	ibd_state_t *state;
2288 	int status;
2289 	int instance;
2290 
2291 	switch (cmd) {
2292 		case DDI_DETACH:
2293 			break;
2294 		case DDI_SUSPEND:
2295 		default:
2296 			return (DDI_FAILURE);
2297 	}
2298 
2299 	instance = ddi_get_instance(dip);
2300 	state = ddi_get_soft_state(ibd_list, instance);
2301 
2302 	/*
2303 	 * First, stop receive interrupts; this stops the
2304 	 * driver from handing up buffers to higher layers.
2305 	 * Wait for receive buffers to be returned; give up
2306 	 * after 5 seconds.
2307 	 */
2308 	ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
2309 	status = 50;
2310 	while (state->id_rx_list.dl_bufs_outstanding > 0) {
2311 		delay(drv_usectohz(100000));
2312 		if (--status == 0) {
2313 			DPRINT(2, "ibd_detach : reclaiming failed");
2314 			goto failed;
2315 		}
2316 	}
2317 
2318 	if (mac_unregister(state->id_mh) != DDI_SUCCESS) {
2319 		DPRINT(10, "ibd_detach : failed in mac_unregister()");
2320 		goto failed;
2321 	}
2322 
2323 	if (ibd_rx_softintr == 1)
2324 		ddi_remove_softintr(state->id_rx);
2325 
2326 	if (ibd_tx_softintr == 1)
2327 		ddi_remove_softintr(state->id_tx);
2328 
2329 	ibd_drv_fini(state);
2330 
2331 	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
2332 		ibd_print_warn(state, "failed to free all IB resources at "
2333 		    "driver detach time");
2334 
2335 	ibd_state_fini(state);
2336 	ddi_soft_state_free(ibd_list, instance);
2337 	return (DDI_SUCCESS);
2338 
2339 failed:
2340 	/*
2341 	 * Reap all the Tx/Rx completions that were posted since we
2342 	 * turned off the notification. Turn on notifications. There
2343 	 * is a race in that we do not reap completions that come in
2344 	 * after the poll and before notifications get turned on. That
2345 	 * is okay, the next rx/tx packet will trigger a completion
2346 	 * that will reap any missed completions.
2347 	 */
2348 	ibd_poll_compq(state, state->id_rcq_hdl);
2349 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
2350 	return (DDI_FAILURE);
2351 }
2352 
2353 /*
2354  * Pre ibt_attach() driver initialization
2355  */
2356 static int
2357 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2358 {
2359 	char buf[64];
2360 
2361 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2362 	state->id_link_state = LINK_STATE_UNKNOWN;
2363 
2364 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2365 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2366 	state->id_trap_stop = B_TRUE;
2367 	state->id_trap_inprog = 0;
2368 
2369 	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2370 	state->id_dip = dip;
2371 
2372 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2373 
2374 	state->id_tx_list.dl_head = NULL;
2375 	state->id_tx_list.dl_tail = NULL;
2376 	state->id_tx_list.dl_pending_sends = B_FALSE;
2377 	state->id_tx_list.dl_cnt = 0;
2378 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2379 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2380 	state->id_tx_busy = 0;
2381 
2382 	state->id_rx_list.dl_head = NULL;
2383 	state->id_rx_list.dl_tail = NULL;
2384 	state->id_rx_list.dl_bufs_outstanding = 0;
2385 	state->id_rx_list.dl_cnt = 0;
2386 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2387 	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
2388 
2389 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2390 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2391 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2392 
2393 #ifdef IBD_LOGGING
2394 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
2395 #endif
2396 
2397 	return (DDI_SUCCESS);
2398 }
2399 
2400 /*
2401  * Post ibt_detach() driver deconstruction
2402  */
2403 static void
2404 ibd_state_fini(ibd_state_t *state)
2405 {
2406 	kmem_cache_destroy(state->id_req_kmc);
2407 
2408 	mutex_destroy(&state->id_rxpost_lock);
2409 	mutex_destroy(&state->id_rx_list.dl_mutex);
2410 
2411 	mutex_destroy(&state->id_txpost_lock);
2412 	mutex_destroy(&state->id_tx_list.dl_mutex);
2413 
2414 	mutex_destroy(&state->id_sched_lock);
2415 	mutex_destroy(&state->id_cq_poll_lock);
2416 
2417 	cv_destroy(&state->id_trap_cv);
2418 	mutex_destroy(&state->id_trap_lock);
2419 	mutex_destroy(&state->id_link_mutex);
2420 
2421 #ifdef IBD_LOGGING
2422 	mutex_destroy(&ibd_lbuf_lock);
2423 #endif
2424 }
2425 
2426 /*
2427  * Fetch IBA parameters for the network device from IB nexus.
2428  */
2429 static int
2430 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid)
2431 {
2432 	/*
2433 	 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
2434 	 * Note that the default partition is also allowed.
2435 	 */
2436 	state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2437 	    0, "port-pkey", IB_PKEY_INVALID_LIMITED);
2438 	if (state->id_pkey <= IB_PKEY_INVALID_FULL) {
2439 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong"
2440 		    "partition\n");
2441 		return (DDI_FAILURE);
2442 	}
2443 
2444 	/*
2445 	 * ... the IBA port ...
2446 	 */
2447 	state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
2448 	    0, "port-number", 0);
2449 	if (state->id_port == 0) {
2450 		DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n");
2451 		return (DDI_FAILURE);
2452 	}
2453 
2454 	/*
2455 	 * ... and HCA GUID.
2456 	 */
2457 	*hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
2458 	    0, "hca-guid", 0);
2459 	if (*hca_guid == 0) {
2460 		DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong "
2461 		    "guid\n");
2462 		return (DDI_FAILURE);
2463 	}
2464 
2465 	return (DDI_SUCCESS);
2466 }
2467 
2468 /*
2469  * Fetch link speed from SA for snmp ifspeed reporting.
2470  */
2471 static uint64_t
2472 ibd_get_portspeed(ibd_state_t *state)
2473 {
2474 	int			ret;
2475 	ibt_path_info_t		path;
2476 	ibt_path_attr_t		path_attr;
2477 	uint8_t			num_paths;
2478 	uint64_t		ifspeed;
2479 
2480 	/*
2481 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2482 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2483 	 * 2000000000. Start with that as default.
2484 	 */
2485 	ifspeed = 2000000000;
2486 
2487 	bzero(&path_attr, sizeof (path_attr));
2488 
2489 	/*
2490 	 * Get the port speed from Loopback path information.
2491 	 */
2492 	path_attr.pa_dgids = &state->id_sgid;
2493 	path_attr.pa_num_dgids = 1;
2494 	path_attr.pa_sgid = state->id_sgid;
2495 
2496 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2497 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2498 		goto earlydone;
2499 
2500 	if (num_paths < 1)
2501 		goto earlydone;
2502 
2503 	/*
2504 	 * In case SA does not return an expected value, report the default
2505 	 * speed as 1X.
2506 	 */
2507 	ret = 1;
2508 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2509 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2510 			ret = 1;
2511 			break;
2512 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2513 			ret = 4;
2514 			break;
2515 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2516 			ret = 12;
2517 			break;
2518 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2519 			ret = 2;
2520 			break;
2521 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2522 			ret = 8;
2523 			break;
2524 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2525 			ret = 16;
2526 			break;
2527 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2528 			ret = 24;
2529 			break;
2530 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2531 			ret = 32;
2532 			break;
2533 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2534 			ret = 48;
2535 			break;
2536 	}
2537 
2538 	ifspeed *= ret;
2539 
2540 earlydone:
2541 	return (ifspeed);
2542 }
2543 
2544 /*
2545  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2546  * representing the input mcg mgid.
2547  */
2548 static ibd_mce_t *
2549 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2550 {
2551 	ibd_mce_t *ptr = list_head(mlist);
2552 
2553 	/*
2554 	 * Do plain linear search.
2555 	 */
2556 	while (ptr != NULL) {
2557 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2558 		    sizeof (ib_gid_t)) == 0)
2559 			return (ptr);
2560 		ptr = list_next(mlist, ptr);
2561 	}
2562 	return (NULL);
2563 }
2564 
2565 /*
2566  * Execute IBA JOIN.
2567  */
2568 static ibt_status_t
2569 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2570 {
2571 	ibt_mcg_attr_t mcg_attr;
2572 
2573 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2574 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2575 	mcg_attr.mc_mgid = mgid;
2576 	mcg_attr.mc_join_state = mce->mc_jstate;
2577 	mcg_attr.mc_scope = state->id_scope;
2578 	mcg_attr.mc_pkey = state->id_pkey;
2579 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2580 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2581 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2582 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2583 	    NULL, NULL));
2584 }
2585 
2586 /*
2587  * This code JOINs the port in the proper way (depending on the join
2588  * state) so that IBA fabric will forward mcg packets to/from the port.
2589  * It also attaches the QPN to the mcg so it can receive those mcg
2590  * packets. This code makes sure not to attach the mcg to the QP if
2591  * that has been previously done due to the mcg being joined with a
2592  * different join state, even though this is not required by SWG_0216,
2593  * refid 3610.
2594  */
2595 static ibd_mce_t *
2596 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2597 {
2598 	ibt_status_t ibt_status;
2599 	ibd_mce_t *mce, *tmce, *omce = NULL;
2600 	boolean_t do_attach = B_TRUE;
2601 
2602 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2603 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2604 
2605 	/*
2606 	 * For enable_multicast Full member joins, we need to do some
2607 	 * extra work. If there is already an mce on the list that
2608 	 * indicates full membership, that means the membership has
2609 	 * not yet been dropped (since the disable_multicast was issued)
2610 	 * because there are pending Tx's to the mcg; in that case, just
2611 	 * mark the mce not to be reaped when the Tx completion queues
2612 	 * an async reap operation.
2613 	 *
2614 	 * If there is already an mce on the list indicating sendonly
2615 	 * membership, try to promote to full membership. Be careful
2616 	 * not to deallocate the old mce, since there might be an AH
2617 	 * pointing to it; instead, update the old mce with new data
2618 	 * that tracks the full membership.
2619 	 */
2620 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2621 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2622 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2623 			ASSERT(omce->mc_fullreap);
2624 			omce->mc_fullreap = B_FALSE;
2625 			return (omce);
2626 		} else {
2627 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2628 		}
2629 	}
2630 
2631 	/*
2632 	 * Allocate the ibd_mce_t to track this JOIN.
2633 	 */
2634 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2635 	mce->mc_fullreap = B_FALSE;
2636 	mce->mc_jstate = jstate;
2637 
2638 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2639 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2640 		    ibt_status);
2641 		kmem_free(mce, sizeof (ibd_mce_t));
2642 		return (NULL);
2643 	}
2644 
2645 	/*
2646 	 * Is an IBA attach required? Not if the interface is already joined
2647 	 * to the mcg in a different appropriate join state.
2648 	 */
2649 	if (jstate == IB_MC_JSTATE_NON) {
2650 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2651 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2652 			do_attach = B_FALSE;
2653 	} else if (jstate == IB_MC_JSTATE_FULL) {
2654 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2655 			do_attach = B_FALSE;
2656 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2657 		do_attach = B_FALSE;
2658 	}
2659 
2660 	if (do_attach) {
2661 		/*
2662 		 * Do the IBA attach.
2663 		 */
2664 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2665 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2666 		    &mce->mc_info)) != IBT_SUCCESS) {
2667 			DPRINT(10, "ibd_join_group : failed qp attachment "
2668 			    "%d\n", ibt_status);
2669 			/*
2670 			 * NOTE that we should probably preserve the join info
2671 			 * in the list and later try to leave again at detach
2672 			 * time.
2673 			 */
2674 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2675 			    state->id_sgid, jstate);
2676 			kmem_free(mce, sizeof (ibd_mce_t));
2677 			return (NULL);
2678 		}
2679 	}
2680 
2681 	/*
2682 	 * Insert the ibd_mce_t in the proper list.
2683 	 */
2684 	if (jstate == IB_MC_JSTATE_NON) {
2685 		IBD_MCACHE_INSERT_NON(state, mce);
2686 	} else {
2687 		/*
2688 		 * Set up the mc_req fields used for reaping the
2689 		 * mcg in case of delayed tx completion (see
2690 		 * ibd_tx_cleanup()). Also done for sendonly join in
2691 		 * case we are promoted to fullmembership later and
2692 		 * keep using the same mce.
2693 		 */
2694 		mce->mc_req.rq_gid = mgid;
2695 		mce->mc_req.rq_ptr = mce;
2696 		/*
2697 		 * Check whether this is the case of trying to join
2698 		 * full member, and we were already joined send only.
2699 		 * We try to drop our SendOnly membership, but it is
2700 		 * possible that the mcg does not exist anymore (and
2701 		 * the subnet trap never reached us), so the leave
2702 		 * operation might fail.
2703 		 */
2704 		if (omce != NULL) {
2705 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2706 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2707 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2708 			bcopy(&mce->mc_info, &omce->mc_info,
2709 			    sizeof (ibt_mcg_info_t));
2710 			kmem_free(mce, sizeof (ibd_mce_t));
2711 			return (omce);
2712 		}
2713 		mutex_enter(&state->id_mc_mutex);
2714 		IBD_MCACHE_INSERT_FULL(state, mce);
2715 		mutex_exit(&state->id_mc_mutex);
2716 	}
2717 
2718 	return (mce);
2719 }
2720 
2721 /*
2722  * Called during port up event handling to attempt to reacquire full
2723  * membership to an mcg. Stripped down version of ibd_join_group().
2724  * Note that it is possible that the mcg might have gone away, and
2725  * gets recreated at this point.
2726  */
2727 static void
2728 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2729 {
2730 	ib_gid_t mgid;
2731 
2732 	/*
2733 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2734 	 * reap/leave is going to try to leave the group. We could prevent
2735 	 * that by adding a boolean flag into ibd_mce_t, if required.
2736 	 */
2737 	if (mce->mc_fullreap)
2738 		return;
2739 
2740 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2741 
2742 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2743 	    mgid.gid_guid);
2744 
2745 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2746 		ibd_print_warn(state, "Failure on port up to rejoin "
2747 		    "multicast gid %016llx:%016llx",
2748 		    (u_longlong_t)mgid.gid_prefix,
2749 		    (u_longlong_t)mgid.gid_guid);
2750 }
2751 
2752 /*
2753  * This code handles delayed Tx completion cleanups for mcg's to which
2754  * disable_multicast has been issued, regular mcg related cleanups during
2755  * disable_multicast, disable_promiscous and mcg traps, as well as
2756  * cleanups during driver detach time. Depending on the join state,
2757  * it deletes the mce from the appropriate list and issues the IBA
2758  * leave/detach; except in the disable_multicast case when the mce
2759  * is left on the active list for a subsequent Tx completion cleanup.
2760  */
2761 static void
2762 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2763     uint8_t jstate)
2764 {
2765 	ibd_mce_t *tmce;
2766 	boolean_t do_detach = B_TRUE;
2767 
2768 	/*
2769 	 * Before detaching, we must check whether the other list
2770 	 * contains the mcg; if we detach blindly, the consumer
2771 	 * who set up the other list will also stop receiving
2772 	 * traffic.
2773 	 */
2774 	if (jstate == IB_MC_JSTATE_FULL) {
2775 		/*
2776 		 * The following check is only relevant while coming
2777 		 * from the Tx completion path in the reap case.
2778 		 */
2779 		if (!mce->mc_fullreap)
2780 			return;
2781 		mutex_enter(&state->id_mc_mutex);
2782 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2783 		mutex_exit(&state->id_mc_mutex);
2784 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2785 			do_detach = B_FALSE;
2786 	} else if (jstate == IB_MC_JSTATE_NON) {
2787 		IBD_MCACHE_PULLOUT_NON(state, mce);
2788 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2789 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2790 			do_detach = B_FALSE;
2791 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2792 		mutex_enter(&state->id_mc_mutex);
2793 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2794 		mutex_exit(&state->id_mc_mutex);
2795 		do_detach = B_FALSE;
2796 	}
2797 
2798 	/*
2799 	 * If we are reacting to a mcg trap and leaving our sendonly or
2800 	 * non membership, the mcg is possibly already gone, so attempting
2801 	 * to leave might fail. On the other hand, we must try to leave
2802 	 * anyway, since this might be a trap from long ago, and we could
2803 	 * have potentially sendonly joined to a recent incarnation of
2804 	 * the mcg and are about to loose track of this information.
2805 	 */
2806 	if (do_detach) {
2807 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2808 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2809 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
2810 	}
2811 
2812 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2813 	kmem_free(mce, sizeof (ibd_mce_t));
2814 }
2815 
2816 /*
2817  * Async code executed due to multicast and promiscuous disable requests
2818  * and mcg trap handling; also executed during driver detach. Mostly, a
2819  * leave and detach is done; except for the fullmember case when Tx
2820  * requests are pending, whence arrangements are made for subsequent
2821  * cleanup on Tx completion.
2822  */
2823 static void
2824 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2825 {
2826 	ipoib_mac_t mcmac;
2827 	boolean_t recycled;
2828 	ibd_mce_t *mce;
2829 
2830 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
2831 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2832 
2833 	if (jstate == IB_MC_JSTATE_NON) {
2834 		recycled = B_TRUE;
2835 		mce = IBD_MCACHE_FIND_NON(state, mgid);
2836 		/*
2837 		 * In case we are handling a mcg trap, we might not find
2838 		 * the mcg in the non list.
2839 		 */
2840 		if (mce == NULL)
2841 			return;
2842 	} else {
2843 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
2844 
2845 		/*
2846 		 * In case we are handling a mcg trap, make sure the trap
2847 		 * is not arriving late; if we have an mce that indicates
2848 		 * that we are already a fullmember, that would be a clear
2849 		 * indication that the trap arrived late (ie, is for a
2850 		 * previous incarnation of the mcg).
2851 		 */
2852 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
2853 			if ((mce == NULL) || (mce->mc_jstate ==
2854 			    IB_MC_JSTATE_FULL))
2855 				return;
2856 		} else {
2857 			ASSERT(jstate == IB_MC_JSTATE_FULL);
2858 
2859 			/*
2860 			 * If join group failed, mce will be NULL here.
2861 			 * This is because in GLDv3 driver, set multicast
2862 			 *  will always return success.
2863 			 */
2864 			if (mce == NULL)
2865 				return;
2866 
2867 			mce->mc_fullreap = B_TRUE;
2868 		}
2869 
2870 		/*
2871 		 * If no pending Tx's remain that reference the AH
2872 		 * for the mcg, recycle it from active to free list.
2873 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
2874 		 * so the last completing Tx will cause an async reap
2875 		 * operation to be invoked, at which time we will drop our
2876 		 * membership to the mcg so that the pending Tx's complete
2877 		 * successfully. Refer to comments on "AH and MCE active
2878 		 * list manipulation" at top of this file. The lock protects
2879 		 * against Tx fast path and Tx cleanup code.
2880 		 */
2881 		mutex_enter(&state->id_ac_mutex);
2882 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
2883 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
2884 		    IB_MC_JSTATE_SEND_ONLY_NON));
2885 		mutex_exit(&state->id_ac_mutex);
2886 	}
2887 
2888 	if (recycled) {
2889 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
2890 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2891 		ibd_async_reap_group(state, mce, mgid, jstate);
2892 	}
2893 }
2894 
2895 /*
2896  * Find the broadcast address as defined by IPoIB; implicitly
2897  * determines the IBA scope, mtu, tclass etc of the link the
2898  * interface is going to be a member of.
2899  */
2900 static ibt_status_t
2901 ibd_find_bgroup(ibd_state_t *state)
2902 {
2903 	ibt_mcg_attr_t mcg_attr;
2904 	uint_t numg;
2905 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
2906 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
2907 	    IB_MC_SCOPE_GLOBAL };
2908 	int i, mcgmtu;
2909 	boolean_t found = B_FALSE;
2910 
2911 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2912 	mcg_attr.mc_pkey = state->id_pkey;
2913 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
2914 
2915 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
2916 		state->id_scope = mcg_attr.mc_scope = scopes[i];
2917 
2918 		/*
2919 		 * Look for the IPoIB broadcast group.
2920 		 */
2921 		state->id_mgid.gid_prefix =
2922 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
2923 		    ((uint64_t)state->id_scope << 48) |
2924 		    ((uint32_t)(state->id_pkey << 16)));
2925 		mcg_attr.mc_mgid = state->id_mgid;
2926 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
2927 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
2928 			found = B_TRUE;
2929 			break;
2930 		}
2931 
2932 	}
2933 
2934 	if (!found) {
2935 		ibd_print_warn(state, "IPoIB broadcast group absent");
2936 		return (IBT_FAILURE);
2937 	}
2938 
2939 	/*
2940 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
2941 	 */
2942 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
2943 	if (state->id_mtu < mcgmtu) {
2944 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
2945 		    "greater than port's maximum MTU %d", mcgmtu,
2946 		    state->id_mtu);
2947 		return (IBT_FAILURE);
2948 	}
2949 	state->id_mtu = mcgmtu;
2950 
2951 	return (IBT_SUCCESS);
2952 }
2953 
2954 /*
2955  * Post ibt_attach() initialization.
2956  */
2957 static int
2958 ibd_drv_init(ibd_state_t *state)
2959 {
2960 	kthread_t *kht;
2961 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
2962 	ibt_ud_chan_query_attr_t ud_chan_attr;
2963 	ibt_hca_portinfo_t *port_infop;
2964 	ibt_hca_attr_t hca_attrs;
2965 	ibt_status_t ibt_status;
2966 	ibt_cq_attr_t cq_attr;
2967 	ib_guid_t hca_guid;
2968 	uint32_t real_size;
2969 	uint32_t *ptr;
2970 	char pathname[OBP_MAXPATHLEN];
2971 	uint_t psize, port_infosz;
2972 
2973 	/*
2974 	 * Initialize id_port before ibt_open_hca because of
2975 	 * ordering requirements in port up/down handling.
2976 	 */
2977 	if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS)
2978 		return (DDI_FAILURE);
2979 
2980 	if (ibt_open_hca(state->id_ibt_hdl, hca_guid,
2981 	    &state->id_hca_hdl) != IBT_SUCCESS) {
2982 		DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n");
2983 		return (DDI_FAILURE);
2984 	}
2985 
2986 	mutex_enter(&state->id_link_mutex);
2987 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
2988 	    state->id_port, &port_infop, &psize,
2989 	    &port_infosz);
2990 	if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
2991 		mutex_exit(&state->id_link_mutex);
2992 		DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n");
2993 		(void) ibt_close_hca(state->id_hca_hdl);
2994 		return (DDI_FAILURE);
2995 	}
2996 
2997 	/*
2998 	 * If the link already went down by the time we get here, give up;
2999 	 * we can not even get the gid since that is not valid. We would
3000 	 * fail in ibd_find_bgroup() anyway.
3001 	 */
3002 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
3003 		mutex_exit(&state->id_link_mutex);
3004 		ibt_free_portinfo(port_infop, port_infosz);
3005 		(void) ibt_close_hca(state->id_hca_hdl);
3006 		ibd_print_warn(state, "Port is not active");
3007 		return (DDI_FAILURE);
3008 	}
3009 
3010 	/*
3011 	 * This verifies the Pkey ibnexus handed us is still valid.
3012 	 * This is also the point from which the pkey table for the
3013 	 * port must hold the exact pkey value at the exact index
3014 	 * across port up/downs.
3015 	 */
3016 	if (ibt_pkey2index(state->id_hca_hdl, state->id_port,
3017 	    state->id_pkey, &state->id_pkix) != IBT_SUCCESS) {
3018 		mutex_exit(&state->id_link_mutex);
3019 		ibt_free_portinfo(port_infop, port_infosz);
3020 		DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n");
3021 		(void) ibt_close_hca(state->id_hca_hdl);
3022 		return (DDI_FAILURE);
3023 	}
3024 
3025 	state->id_mtu = (128 << port_infop->p_mtu);
3026 	state->id_sgid = *port_infop->p_sgid_tbl;
3027 	state->id_link_state = LINK_STATE_UP;
3028 	mutex_exit(&state->id_link_mutex);
3029 
3030 	ibt_free_portinfo(port_infop, port_infosz);
3031 
3032 	state->id_link_speed = ibd_get_portspeed(state);
3033 
3034 	/*
3035 	 * Read drv conf and record what the policy is on enabling LSO
3036 	 */
3037 	if (ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
3038 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
3039 		state->id_lso_policy = B_TRUE;
3040 	} else {
3041 		state->id_lso_policy = B_FALSE;
3042 	}
3043 
3044 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
3045 	ASSERT(ibt_status == IBT_SUCCESS);
3046 
3047 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
3048 		DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n");
3049 		goto drv_init_fail_find_bgroup;
3050 	}
3051 
3052 	if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
3053 	    &state->id_pd_hdl) != IBT_SUCCESS) {
3054 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n");
3055 		goto drv_init_fail_alloc_pd;
3056 	}
3057 
3058 	/* Initialize the parallel ARP cache and AHs */
3059 	if (ibd_acache_init(state) != DDI_SUCCESS) {
3060 		DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n");
3061 		goto drv_init_fail_acache;
3062 	}
3063 
3064 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
3065 		state->id_hca_res_lkey_capab = 1;
3066 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
3067 	}
3068 
3069 	/*
3070 	 * Check various tunable limits.
3071 	 */
3072 
3073 	/*
3074 	 * See if extended sgl size information is provided by the hca; if yes,
3075 	 * use the correct one and set the maximum sqseg value.
3076 	 */
3077 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO)
3078 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
3079 	else
3080 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
3081 
3082 	/*
3083 	 * Set LSO capability and maximum length
3084 	 */
3085 	if (hca_attrs.hca_max_lso_size > 0) {
3086 		state->id_lso_capable = B_TRUE;
3087 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
3088 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
3089 		else
3090 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
3091 	} else {
3092 		state->id_lso_capable = B_FALSE;
3093 		state->id_lso_maxlen = 0;
3094 	}
3095 
3096 
3097 	/*
3098 	 * Check #r/s wqes against max channel size.
3099 	 */
3100 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE)
3101 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
3102 	else
3103 		state->id_num_rwqe = IBD_NUM_RWQE;
3104 
3105 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE)
3106 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
3107 	else
3108 		state->id_num_swqe = IBD_NUM_SWQE;
3109 
3110 	/*
3111 	 * Check the hardware checksum capability. Currently we only consider
3112 	 * full checksum offload.
3113 	 */
3114 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
3115 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
3116 	}
3117 
3118 	/*
3119 	 * Allocate Rx/combined CQ:
3120 	 * Theoretically, there is no point in having more than #rwqe
3121 	 * plus #swqe cqe's, except that the CQ will be signalled for
3122 	 * overflow when the last wqe completes, if none of the previous
3123 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
3124 	 * to make sure such overflow does not occur.
3125 	 */
3126 	cq_attr.cq_sched = NULL;
3127 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
3128 
3129 	if (ibd_separate_cqs == 1) {
3130 		/*
3131 		 * Allocate Receive CQ.
3132 		 */
3133 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
3134 			cq_attr.cq_size = state->id_num_rwqe + 1;
3135 		} else {
3136 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3137 			state->id_num_rwqe = cq_attr.cq_size - 1;
3138 		}
3139 
3140 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3141 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3142 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3143 			goto drv_init_fail_alloc_rcq;
3144 		}
3145 
3146 		if (ibt_modify_cq(state->id_rcq_hdl,
3147 		    ibd_rxcomp_count, ibd_rxcomp_usec, 0) != IBT_SUCCESS) {
3148 			DPRINT(10, "ibd_drv_init: Receive CQ interrupt "
3149 			    "moderation failed\n");
3150 		}
3151 
3152 		state->id_rxwcs_size = state->id_num_rwqe + 1;
3153 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
3154 		    state->id_rxwcs_size, KM_SLEEP);
3155 
3156 		/*
3157 		 * Allocate Send CQ.
3158 		 */
3159 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
3160 			cq_attr.cq_size = state->id_num_swqe + 1;
3161 		} else {
3162 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3163 			state->id_num_swqe = cq_attr.cq_size - 1;
3164 		}
3165 
3166 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3167 		    &state->id_scq_hdl, &real_size) != IBT_SUCCESS) {
3168 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3169 			goto drv_init_fail_alloc_scq;
3170 		}
3171 		if (ibt_modify_cq(state->id_scq_hdl,
3172 		    10, 300, 0) != IBT_SUCCESS) {
3173 			DPRINT(10, "ibd_drv_init: Send CQ interrupt "
3174 			    "moderation failed\n");
3175 		}
3176 
3177 		state->id_txwcs_size = state->id_num_swqe + 1;
3178 		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
3179 		    state->id_txwcs_size, KM_SLEEP);
3180 	} else {
3181 		/*
3182 		 * Allocate combined Send/Receive CQ.
3183 		 */
3184 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
3185 		    state->id_num_swqe + 1)) {
3186 			cq_attr.cq_size = state->id_num_rwqe +
3187 			    state->id_num_swqe + 1;
3188 		} else {
3189 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
3190 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
3191 			    state->id_num_rwqe) / (state->id_num_rwqe +
3192 			    state->id_num_swqe);
3193 			state->id_num_swqe = cq_attr.cq_size - 1 -
3194 			    state->id_num_rwqe;
3195 		}
3196 
3197 		state->id_rxwcs_size = cq_attr.cq_size;
3198 		state->id_txwcs_size = state->id_rxwcs_size;
3199 
3200 		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
3201 		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
3202 			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
3203 			goto drv_init_fail_alloc_rcq;
3204 		}
3205 		state->id_scq_hdl = state->id_rcq_hdl;
3206 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
3207 		    state->id_rxwcs_size, KM_SLEEP);
3208 		state->id_txwcs = state->id_rxwcs;
3209 	}
3210 
3211 	/*
3212 	 * Print message in case we could not allocate as many wqe's
3213 	 * as was requested. Note that in the combined CQ case, we will
3214 	 * get the following message.
3215 	 */
3216 	if (state->id_num_rwqe != IBD_NUM_RWQE)
3217 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
3218 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
3219 	if (state->id_num_swqe != IBD_NUM_SWQE)
3220 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
3221 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
3222 
3223 	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
3224 	if (state->id_hca_res_lkey_capab)
3225 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
3226 	if (state->id_lso_policy && state->id_lso_capable)
3227 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
3228 
3229 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
3230 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
3231 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
3232 	ud_alloc_attr.ud_sizes.cs_sq	= state->id_num_swqe;
3233 	ud_alloc_attr.ud_sizes.cs_rq	= state->id_num_rwqe;
3234 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
3235 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
3236 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
3237 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
3238 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
3239 	ud_alloc_attr.ud_clone_chan	= NULL;
3240 
3241 	if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
3242 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) {
3243 		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()"
3244 		    "\n");
3245 		goto drv_init_fail_alloc_chan;
3246 	}
3247 
3248 	if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) !=
3249 	    DDI_SUCCESS) {
3250 		DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()");
3251 		goto drv_init_fail_query_chan;
3252 	}
3253 
3254 	state->id_qpnum = ud_chan_attr.ud_qpn;
3255 	/* state->id_max_sqseg = ud_chan_attr.ud_chan_sizes.cs_sq_sgl; */
3256 
3257 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
3258 		state->id_max_sqseg = IBD_MAX_SQSEG;
3259 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
3260 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
3261 		    state->id_max_sqseg, IBD_MAX_SQSEG);
3262 	}
3263 
3264 	/* Initialize the Transmit buffer list */
3265 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
3266 		DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n");
3267 		goto drv_init_fail_txlist_init;
3268 	}
3269 
3270 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
3271 		/*
3272 		 * Setup the handler we will use for regular DLPI stuff
3273 		 */
3274 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
3275 		if (ibt_enable_cq_notify(state->id_scq_hdl,
3276 		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
3277 			DPRINT(10, "ibd_drv_init : failed in"
3278 			    " ibt_enable_cq_notify()\n");
3279 			goto drv_init_fail_cq_notify;
3280 		}
3281 	}
3282 
3283 	/* Initialize the Receive buffer list */
3284 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
3285 		DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n");
3286 		goto drv_init_fail_rxlist_init;
3287 	}
3288 
3289 	/* Join to IPoIB broadcast group as required by IPoIB */
3290 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
3291 		DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n");
3292 		goto drv_init_fail_join_group;
3293 	}
3294 
3295 	/*
3296 	 * Create the async thread; thread_create never fails.
3297 	 */
3298 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
3299 	    TS_RUN, minclsyspri);
3300 
3301 	state->id_async_thrid = kht->t_did;
3302 
3303 	/*
3304 	 * The local mac address is now known. Create the IPoIB
3305 	 * address.
3306 	 */
3307 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
3308 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3309 	/*
3310 	 * Similarly, program in the broadcast mac address.
3311 	 */
3312 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix,
3313 	    state->id_mgid.gid_guid);
3314 
3315 	ptr = (uint32_t *)&state->id_macaddr;
3316 	DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n",
3317 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3318 	ptr = (uint32_t *)&state->id_bcaddr;
3319 	DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n",
3320 	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
3321 	DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n",
3322 	    state->id_pkey, state->id_mgid.gid_prefix,
3323 	    state->id_mgid.gid_guid);
3324 	DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n",
3325 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
3326 	DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey);
3327 	DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu);
3328 	(void) ddi_pathname(state->id_dip, pathname);
3329 	DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname);
3330 
3331 	return (DDI_SUCCESS);
3332 
3333 drv_init_fail_join_group:
3334 	ibd_fini_rxlist(state);
3335 
3336 drv_init_fail_rxlist_init:
3337 drv_init_fail_cq_notify:
3338 	ibd_fini_txlist(state);
3339 
3340 drv_init_fail_txlist_init:
3341 drv_init_fail_query_chan:
3342 	if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS)
3343 		DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()");
3344 
3345 drv_init_fail_alloc_chan:
3346 	if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) !=
3347 	    IBT_SUCCESS))
3348 		DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()");
3349 
3350 	if (ibd_separate_cqs == 1)
3351 		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
3352 		    state->id_txwcs_size);
3353 
3354 drv_init_fail_alloc_scq:
3355 	if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS)
3356 		DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()");
3357 	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
3358 
3359 drv_init_fail_alloc_rcq:
3360 	ibd_acache_fini(state);
3361 drv_init_fail_acache:
3362 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
3363 		DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()");
3364 
3365 drv_init_fail_alloc_pd:
3366 	ibt_free_mcg_info(state->id_mcinfo, 1);
3367 drv_init_fail_find_bgroup:
3368 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
3369 		DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()");
3370 
3371 	return (DDI_FAILURE);
3372 }
3373 
3374 
3375 static int
3376 ibd_alloc_tx_copybufs(ibd_state_t *state)
3377 {
3378 	ibt_mr_attr_t mem_attr;
3379 
3380 	/*
3381 	 * Allocate one big chunk for all regular tx copy bufs
3382 	 */
3383 	state->id_tx_buf_sz = state->id_mtu;
3384 	if (state->id_lso_policy && state->id_lso_capable &&
3385 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3386 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3387 	}
3388 
3389 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3390 	    state->id_tx_buf_sz, KM_SLEEP);
3391 
3392 	/*
3393 	 * Do one memory registration on the entire txbuf area
3394 	 */
3395 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3396 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3397 	mem_attr.mr_as = NULL;
3398 	mem_attr.mr_flags = IBT_MR_SLEEP;
3399 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3400 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3401 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3402 		kmem_free(state->id_tx_bufs,
3403 		    state->id_num_swqe * state->id_tx_buf_sz);
3404 		state->id_tx_bufs = NULL;
3405 		return (DDI_FAILURE);
3406 	}
3407 
3408 	return (DDI_SUCCESS);
3409 }
3410 
3411 static int
3412 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3413 {
3414 	ibt_mr_attr_t mem_attr;
3415 	ibd_lsobuf_t *buflist;
3416 	ibd_lsobuf_t *lbufp;
3417 	ibd_lsobuf_t *tail;
3418 	ibd_lsobkt_t *bktp;
3419 	uint8_t *membase;
3420 	uint8_t *memp;
3421 	uint_t memsz;
3422 	int i;
3423 
3424 	/*
3425 	 * Allocate the lso bucket
3426 	 */
3427 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3428 
3429 	/*
3430 	 * Allocate the entire lso memory and register it
3431 	 */
3432 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3433 	membase = kmem_zalloc(memsz, KM_SLEEP);
3434 
3435 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3436 	mem_attr.mr_len = memsz;
3437 	mem_attr.mr_as = NULL;
3438 	mem_attr.mr_flags = IBT_MR_SLEEP;
3439 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3440 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3441 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3442 		kmem_free(membase, memsz);
3443 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3444 		return (DDI_FAILURE);
3445 	}
3446 
3447 	/*
3448 	 * Now allocate the buflist.  Note that the elements in the buflist and
3449 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3450 	 * can always derive the address of a buflist entry from the address of
3451 	 * an lso buffer.
3452 	 */
3453 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3454 	    KM_SLEEP);
3455 
3456 	/*
3457 	 * Set up the lso buf chain
3458 	 */
3459 	memp = membase;
3460 	lbufp = buflist;
3461 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3462 		lbufp->lb_isfree = 1;
3463 		lbufp->lb_buf = memp;
3464 		lbufp->lb_next = lbufp + 1;
3465 
3466 		tail = lbufp;
3467 
3468 		memp += IBD_LSO_BUFSZ;
3469 		lbufp++;
3470 	}
3471 	tail->lb_next = NULL;
3472 
3473 	/*
3474 	 * Set up the LSO buffer information in ibd state
3475 	 */
3476 	bktp->bkt_bufl = buflist;
3477 	bktp->bkt_free_head = buflist;
3478 	bktp->bkt_mem = membase;
3479 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3480 	bktp->bkt_nfree = bktp->bkt_nelem;
3481 
3482 	state->id_lso = bktp;
3483 
3484 	return (DDI_SUCCESS);
3485 }
3486 
3487 /*
3488  * Statically allocate Tx buffer list(s).
3489  */
3490 static int
3491 ibd_init_txlist(ibd_state_t *state)
3492 {
3493 	ibd_swqe_t *swqe;
3494 	ibt_lkey_t lkey;
3495 	int i;
3496 
3497 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3498 		return (DDI_FAILURE);
3499 
3500 	if (state->id_lso_policy && state->id_lso_capable) {
3501 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3502 			state->id_lso_policy = B_FALSE;
3503 	}
3504 
3505 	/*
3506 	 * Allocate and setup the swqe list
3507 	 */
3508 	lkey = state->id_tx_mr_desc.md_lkey;
3509 	for (i = 0; i < state->id_num_swqe; i++) {
3510 		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
3511 			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
3512 			ibd_fini_txlist(state);
3513 			return (DDI_FAILURE);
3514 		}
3515 
3516 		/* add to list */
3517 		state->id_tx_list.dl_cnt++;
3518 		if (state->id_tx_list.dl_head == NULL) {
3519 			swqe->swqe_prev = NULL;
3520 			swqe->swqe_next = NULL;
3521 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3522 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3523 		} else {
3524 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3525 			swqe->swqe_next = NULL;
3526 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3527 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3528 		}
3529 	}
3530 
3531 	return (DDI_SUCCESS);
3532 }
3533 
3534 static int
3535 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3536     uint32_t *nds_p)
3537 {
3538 	ibd_lsobkt_t *bktp;
3539 	ibd_lsobuf_t *lbufp;
3540 	ibd_lsobuf_t *nextp;
3541 	ibt_lkey_t lso_lkey;
3542 	uint_t frag_sz;
3543 	uint_t num_needed;
3544 	int i;
3545 
3546 	ASSERT(sgl_p != NULL);
3547 	ASSERT(nds_p != NULL);
3548 	ASSERT(req_sz != 0);
3549 
3550 	/*
3551 	 * Determine how many bufs we'd need for the size requested
3552 	 */
3553 	num_needed = req_sz / IBD_LSO_BUFSZ;
3554 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3555 		num_needed++;
3556 
3557 	mutex_enter(&state->id_lso_lock);
3558 
3559 	/*
3560 	 * If we don't have enough lso bufs, return failure
3561 	 */
3562 	ASSERT(state->id_lso != NULL);
3563 	bktp = state->id_lso;
3564 	if (bktp->bkt_nfree < num_needed) {
3565 		mutex_exit(&state->id_lso_lock);
3566 		return (-1);
3567 	}
3568 
3569 	/*
3570 	 * Pick the first 'num_needed' bufs from the free list
3571 	 */
3572 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3573 	lbufp = bktp->bkt_free_head;
3574 	for (i = 0; i < num_needed; i++) {
3575 		ASSERT(lbufp->lb_isfree != 0);
3576 		ASSERT(lbufp->lb_buf != NULL);
3577 
3578 		nextp = lbufp->lb_next;
3579 
3580 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3581 		sgl_p[i].ds_key = lso_lkey;
3582 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3583 
3584 		lbufp->lb_isfree = 0;
3585 		lbufp->lb_next = NULL;
3586 
3587 		lbufp = nextp;
3588 	}
3589 	bktp->bkt_free_head = lbufp;
3590 
3591 	/*
3592 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3593 	 * to adjust the last sgl entry's length. Since we know we need atleast
3594 	 * one, the i-1 use below is ok.
3595 	 */
3596 	if (frag_sz) {
3597 		sgl_p[i-1].ds_len = frag_sz;
3598 	}
3599 
3600 	/*
3601 	 * Update nfree count and return
3602 	 */
3603 	bktp->bkt_nfree -= num_needed;
3604 
3605 	mutex_exit(&state->id_lso_lock);
3606 
3607 	*nds_p = num_needed;
3608 
3609 	return (0);
3610 }
3611 
3612 static void
3613 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3614 {
3615 	ibd_lsobkt_t *bktp;
3616 	ibd_lsobuf_t *lbufp;
3617 	uint8_t *lso_mem_end;
3618 	uint_t ndx;
3619 	int i;
3620 
3621 	mutex_enter(&state->id_lso_lock);
3622 
3623 	bktp = state->id_lso;
3624 	ASSERT(bktp != NULL);
3625 
3626 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3627 	for (i = 0; i < nds; i++) {
3628 		uint8_t *va;
3629 
3630 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3631 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3632 
3633 		/*
3634 		 * Figure out the buflist element this sgl buffer corresponds
3635 		 * to and put it back at the head
3636 		 */
3637 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3638 		lbufp = bktp->bkt_bufl + ndx;
3639 
3640 		ASSERT(lbufp->lb_isfree == 0);
3641 		ASSERT(lbufp->lb_buf == va);
3642 
3643 		lbufp->lb_isfree = 1;
3644 		lbufp->lb_next = bktp->bkt_free_head;
3645 		bktp->bkt_free_head = lbufp;
3646 	}
3647 	bktp->bkt_nfree += nds;
3648 
3649 	mutex_exit(&state->id_lso_lock);
3650 }
3651 
3652 static void
3653 ibd_free_tx_copybufs(ibd_state_t *state)
3654 {
3655 	/*
3656 	 * Unregister txbuf mr
3657 	 */
3658 	if (ibt_deregister_mr(state->id_hca_hdl,
3659 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3660 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3661 	}
3662 	state->id_tx_mr_hdl = NULL;
3663 
3664 	/*
3665 	 * Free txbuf memory
3666 	 */
3667 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3668 	state->id_tx_bufs = NULL;
3669 }
3670 
3671 static void
3672 ibd_free_tx_lsobufs(ibd_state_t *state)
3673 {
3674 	ibd_lsobkt_t *bktp;
3675 
3676 	mutex_enter(&state->id_lso_lock);
3677 
3678 	if ((bktp = state->id_lso) == NULL) {
3679 		mutex_exit(&state->id_lso_lock);
3680 		return;
3681 	}
3682 
3683 	/*
3684 	 * First, free the buflist
3685 	 */
3686 	ASSERT(bktp->bkt_bufl != NULL);
3687 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3688 
3689 	/*
3690 	 * Unregister the LSO memory and free it
3691 	 */
3692 	ASSERT(bktp->bkt_mr_hdl != NULL);
3693 	if (ibt_deregister_mr(state->id_hca_hdl,
3694 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3695 		DPRINT(10,
3696 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3697 	}
3698 	ASSERT(bktp->bkt_mem);
3699 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3700 
3701 	/*
3702 	 * Finally free the bucket
3703 	 */
3704 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3705 	state->id_lso = NULL;
3706 
3707 	mutex_exit(&state->id_lso_lock);
3708 }
3709 
3710 /*
3711  * Free the statically allocated Tx buffer list.
3712  */
3713 static void
3714 ibd_fini_txlist(ibd_state_t *state)
3715 {
3716 	ibd_swqe_t *node;
3717 
3718 	/*
3719 	 * Free the allocated swqes
3720 	 */
3721 	mutex_enter(&state->id_tx_list.dl_mutex);
3722 	while (state->id_tx_list.dl_head != NULL) {
3723 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3724 		state->id_tx_list.dl_head = node->swqe_next;
3725 		state->id_tx_list.dl_cnt--;
3726 		ASSERT(state->id_tx_list.dl_cnt >= 0);
3727 		ibd_free_swqe(state, node);
3728 	}
3729 	mutex_exit(&state->id_tx_list.dl_mutex);
3730 
3731 	ibd_free_tx_lsobufs(state);
3732 	ibd_free_tx_copybufs(state);
3733 }
3734 
3735 /*
3736  * Allocate a single send wqe and register it so it is almost
3737  * ready to be posted to the hardware.
3738  */
3739 static int
3740 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
3741 {
3742 	ibd_swqe_t *swqe;
3743 
3744 	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
3745 	*wqe = swqe;
3746 
3747 	swqe->swqe_type = IBD_WQE_SEND;
3748 	swqe->swqe_next = NULL;
3749 	swqe->swqe_prev = NULL;
3750 	swqe->swqe_im_mblk = NULL;
3751 
3752 	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3753 	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
3754 	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3755 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3756 
3757 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3758 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3759 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3760 
3761 	/* These are set in send */
3762 	swqe->w_swr.wr_nds = 0;
3763 	swqe->w_swr.wr_sgl = NULL;
3764 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3765 
3766 	return (DDI_SUCCESS);
3767 }
3768 
3769 /*
3770  * Free an allocated send wqe.
3771  */
3772 /*ARGSUSED*/
3773 static void
3774 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3775 {
3776 	kmem_free(swqe, sizeof (ibd_swqe_t));
3777 }
3778 
3779 /*
3780  * Post a rwqe to the hardware and add it to the Rx list. The
3781  * "recycle" parameter indicates whether an old rwqe is being
3782  * recycled, or this is a new one.
3783  */
3784 static int
3785 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3786 {
3787 	ibt_status_t ibt_status;
3788 
3789 	if (recycle == B_FALSE) {
3790 		mutex_enter(&state->id_rx_list.dl_mutex);
3791 		if (state->id_rx_list.dl_head == NULL) {
3792 			rwqe->rwqe_prev = NULL;
3793 			rwqe->rwqe_next = NULL;
3794 			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3795 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3796 		} else {
3797 			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3798 			rwqe->rwqe_next = NULL;
3799 			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3800 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3801 		}
3802 		mutex_exit(&state->id_rx_list.dl_mutex);
3803 	}
3804 
3805 	mutex_enter(&state->id_rxpost_lock);
3806 	if (state->id_rx_busy) {
3807 		rwqe->w_post_link = NULL;
3808 		if (state->id_rx_head)
3809 			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
3810 		else
3811 			state->id_rx_head = rwqe;
3812 		state->id_rx_tailp = &(rwqe->w_post_link);
3813 	} else {
3814 		state->id_rx_busy = 1;
3815 		do {
3816 			mutex_exit(&state->id_rxpost_lock);
3817 
3818 			/*
3819 			 * Here we should add dl_cnt before post recv, because
3820 			 * we would have to make sure dl_cnt is updated before
3821 			 * the corresponding ibd_process_rx() is called.
3822 			 */
3823 			atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3824 
3825 			ibt_status = ibt_post_recv(state->id_chnl_hdl,
3826 			    &rwqe->w_rwr, 1, NULL);
3827 			if (ibt_status != IBT_SUCCESS) {
3828 				(void) atomic_add_32_nv(
3829 				    &state->id_rx_list.dl_cnt, -1);
3830 				ibd_print_warn(state, "ibd_post_rwqe: "
3831 				    "posting failed, ret=%d", ibt_status);
3832 				return (DDI_FAILURE);
3833 			}
3834 
3835 			mutex_enter(&state->id_rxpost_lock);
3836 			rwqe = state->id_rx_head;
3837 			if (rwqe) {
3838 				state->id_rx_head =
3839 				    (ibd_rwqe_t *)(rwqe->w_post_link);
3840 			}
3841 		} while (rwqe);
3842 		state->id_rx_busy = 0;
3843 	}
3844 	mutex_exit(&state->id_rxpost_lock);
3845 
3846 	return (DDI_SUCCESS);
3847 }
3848 
3849 /*
3850  * Allocate the statically allocated Rx buffer list.
3851  */
3852 static int
3853 ibd_init_rxlist(ibd_state_t *state)
3854 {
3855 	ibd_rwqe_t *rwqe;
3856 	int i;
3857 
3858 	for (i = 0; i < state->id_num_rwqe; i++) {
3859 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3860 			ibd_fini_rxlist(state);
3861 			return (DDI_FAILURE);
3862 		}
3863 
3864 		if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) {
3865 			ibd_free_rwqe(state, rwqe);
3866 			ibd_fini_rxlist(state);
3867 			return (DDI_FAILURE);
3868 		}
3869 	}
3870 
3871 	return (DDI_SUCCESS);
3872 }
3873 
3874 /*
3875  * Free the statically allocated Rx buffer list.
3876  *
3877  */
3878 static void
3879 ibd_fini_rxlist(ibd_state_t *state)
3880 {
3881 	ibd_rwqe_t *node;
3882 
3883 	mutex_enter(&state->id_rx_list.dl_mutex);
3884 	while (state->id_rx_list.dl_head != NULL) {
3885 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3886 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3887 		state->id_rx_list.dl_cnt--;
3888 		ASSERT(state->id_rx_list.dl_cnt >= 0);
3889 
3890 		ibd_free_rwqe(state, node);
3891 	}
3892 	mutex_exit(&state->id_rx_list.dl_mutex);
3893 }
3894 
3895 /*
3896  * Allocate a single recv wqe and register it so it is almost
3897  * ready to be posted to the hardware.
3898  */
3899 static int
3900 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3901 {
3902 	ibt_mr_attr_t mem_attr;
3903 	ibd_rwqe_t *rwqe;
3904 
3905 	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3906 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3907 		return (DDI_FAILURE);
3908 	}
3909 	*wqe = rwqe;
3910 	rwqe->rwqe_type = IBD_WQE_RECV;
3911 	rwqe->w_state = state;
3912 	rwqe->rwqe_next = NULL;
3913 	rwqe->rwqe_prev = NULL;
3914 	rwqe->w_freeing_wqe = B_FALSE;
3915 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3916 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3917 
3918 	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3919 	    IPOIB_GRH_SIZE, KM_NOSLEEP);
3920 	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
3921 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3922 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3923 		return (DDI_FAILURE);
3924 	}
3925 
3926 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3927 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3928 	    NULL) {
3929 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3930 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3931 		    state->id_mtu + IPOIB_GRH_SIZE);
3932 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3933 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3934 		return (DDI_FAILURE);
3935 	}
3936 
3937 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3938 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3939 	mem_attr.mr_as = NULL;
3940 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3941 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3942 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3943 	    IBT_SUCCESS) {
3944 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
3945 		rwqe->w_freeing_wqe = B_TRUE;
3946 		freemsg(rwqe->rwqe_im_mblk);
3947 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3948 		    state->id_mtu + IPOIB_GRH_SIZE);
3949 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3950 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3951 		return (DDI_FAILURE);
3952 	}
3953 
3954 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3955 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3956 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3957 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3958 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3959 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3960 	rwqe->w_rwr.wr_nds = 1;
3961 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3962 
3963 	return (DDI_SUCCESS);
3964 }
3965 
3966 /*
3967  * Free an allocated recv wqe.
3968  */
3969 static void
3970 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3971 {
3972 	if (ibt_deregister_mr(state->id_hca_hdl,
3973 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3974 		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
3975 		return;
3976 	}
3977 
3978 	/*
3979 	 * Indicate to the callback function that this rwqe/mblk
3980 	 * should not be recycled. The freemsg() will invoke
3981 	 * ibd_freemsg_cb().
3982 	 */
3983 	if (rwqe->rwqe_im_mblk != NULL) {
3984 		rwqe->w_freeing_wqe = B_TRUE;
3985 		freemsg(rwqe->rwqe_im_mblk);
3986 	}
3987 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3988 	    state->id_mtu + IPOIB_GRH_SIZE);
3989 	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3990 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3991 }
3992 
3993 /*
3994  * Delete the rwqe being freed from the rx list.
3995  */
3996 static void
3997 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3998 {
3999 	mutex_enter(&state->id_rx_list.dl_mutex);
4000 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
4001 		state->id_rx_list.dl_head = rwqe->rwqe_next;
4002 	else
4003 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
4004 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
4005 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
4006 	else
4007 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
4008 	mutex_exit(&state->id_rx_list.dl_mutex);
4009 }
4010 
4011 /*
4012  * Pre ibt_detach() deconstruction.
4013  */
4014 static void
4015 ibd_drv_fini(ibd_state_t *state)
4016 {
4017 	ib_gid_t mgid;
4018 	ibd_mce_t *mce;
4019 	ibt_status_t status;
4020 	uint8_t jstate;
4021 
4022 	/*
4023 	 * Desubscribe from trap notices; we will be tearing down
4024 	 * the mcg lists soon. Make sure the trap handler does nothing
4025 	 * even if it is invoked (ie till we invoke ibt_detach()).
4026 	 */
4027 	ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4028 	mutex_enter(&state->id_trap_lock);
4029 	state->id_trap_stop = B_TRUE;
4030 	while (state->id_trap_inprog > 0)
4031 		cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4032 	mutex_exit(&state->id_trap_lock);
4033 
4034 	/*
4035 	 * Flushing the channel ensures that all pending WQE's
4036 	 * are marked with flush_error and handed to the CQ. It
4037 	 * does not guarantee the invocation of the CQ handler.
4038 	 * This call is guaranteed to return successfully for UD QPNs.
4039 	 */
4040 	status = ibt_flush_channel(state->id_chnl_hdl);
4041 	ASSERT(status == IBT_SUCCESS);
4042 
4043 	/*
4044 	 * We possibly need a loop here to wait for all the Tx
4045 	 * callbacks to happen. The Tx handlers will retrieve
4046 	 * held resources like AH ac_ref count, registered memory
4047 	 * and possibly IBD_ASYNC_REAP requests. Rx interrupts were already
4048 	 * turned off (in ibd_detach()); turn off Tx interrupts and
4049 	 * poll. By the time the polling returns an empty indicator,
4050 	 * we are sure we have seen all pending Tx callbacks. Note
4051 	 * that after the ibt_set_cq_handler() returns, the old handler
4052 	 * is guaranteed not to be invoked anymore.
4053 	 */
4054 	if (ibd_separate_cqs == 1)
4055 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4056 	ibd_poll_compq(state, state->id_scq_hdl);
4057 
4058 	/*
4059 	 * No more async requests will be posted since the device has been
4060 	 * unregistered; completion handlers have been turned off, so Tx
4061 	 * handler will not cause any more IBD_ASYNC_REAP requests. Queue a
4062 	 * request for the async thread to exit, which will be serviced
4063 	 * after any pending ones. This can take a while, specially if the
4064 	 * SM is unreachable, since IBMF will slowly timeout each SM request
4065 	 * issued by the async thread. Reap the thread before continuing on,
4066 	 * we do not want it to be lingering in modunloaded code.
4067 	 */
4068 	ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4069 	thread_join(state->id_async_thrid);
4070 
4071 	/*
4072 	 * We can not be in promiscuous mode anymore, upper layers
4073 	 * would have made a request to disable it (if ever set previously)
4074 	 * before the detach is allowed to progress to this point; and the
4075 	 * aysnc thread would have processed that request by now. Thus the
4076 	 * nonmember list is guaranteed empty at this point.
4077 	 */
4078 	ASSERT(state->id_prom_op != IBD_OP_COMPLETED);
4079 
4080 	/*
4081 	 * Drop all residual full/non membership. This includes full
4082 	 * membership to the broadcast group, and any nonmembership
4083 	 * acquired during transmits. We do this after the Tx completion
4084 	 * handlers are done, since those might result in some late
4085 	 * leaves; this also eliminates a potential race with that
4086 	 * path wrt the mc full list insert/delete. Trap handling
4087 	 * has also been suppressed at this point. Thus, no locks
4088 	 * are required while traversing the mc full list.
4089 	 */
4090 	DPRINT(2, "ibd_drv_fini : clear full cache entries");
4091 	mce = list_head(&state->id_mc_full);
4092 	while (mce != NULL) {
4093 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4094 		jstate = mce->mc_jstate;
4095 		mce = list_next(&state->id_mc_full, mce);
4096 		ibd_leave_group(state, mgid, jstate);
4097 	}
4098 
4099 	ibt_free_mcg_info(state->id_mcinfo, 1);
4100 
4101 	/*
4102 	 * Kill the channel now; guaranteed to return successfully
4103 	 * for UD QPNs.
4104 	 */
4105 	status = ibt_free_channel(state->id_chnl_hdl);
4106 	ASSERT(status == IBT_SUCCESS);
4107 
4108 	/*
4109 	 * Kill the CQ; all completion handlers are guaranteed to
4110 	 * have terminated by the time this returns. Since we killed
4111 	 * the QPN above, we can not receive the IBT_CQ_BUSY error.
4112 	 */
4113 	status = ibt_free_cq(state->id_rcq_hdl);
4114 	ASSERT(status == IBT_SUCCESS);
4115 	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
4116 
4117 	if (ibd_separate_cqs == 1) {
4118 		status = ibt_free_cq(state->id_scq_hdl);
4119 		ASSERT(status == IBT_SUCCESS);
4120 		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
4121 		    state->id_txwcs_size);
4122 	}
4123 
4124 	/*
4125 	 * Since these following will act on the Rx/Tx list, which
4126 	 * is also looked at by the Rx/Tx handlers, keep them around
4127 	 * till all handlers are guaranteed to have completed.
4128 	 */
4129 	ibd_fini_rxlist(state);
4130 	ibd_fini_txlist(state);
4131 
4132 	/*
4133 	 * Clean up the active AH hash list.
4134 	 */
4135 	mod_hash_destroy_hash(state->id_ah_active_hash);
4136 
4137 	/*
4138 	 * Free parallel ARP cache and AHs; we are sure all of these
4139 	 * resources have been released by the Tx completion handler.
4140 	 */
4141 	ibd_acache_fini(state);
4142 
4143 	/*
4144 	 * We freed the QPN, all the MRs and AHs. This step should not
4145 	 * fail; print a warning message if it does fail, due to a bug
4146 	 * in the driver.
4147 	 */
4148 	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
4149 		ibd_print_warn(state, "failed to free protection domain");
4150 
4151 	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
4152 		ibd_print_warn(state, "failed to close HCA device");
4153 }
4154 
4155 /*
4156  * IBA Rx/Tx completion queue handler. Guaranteed to be single
4157  * threaded and nonreentrant for this CQ. When using combined CQ,
4158  * this handles Tx and Rx completions. With separate CQs, this handles
4159  * only Rx completions.
4160  */
4161 /* ARGSUSED */
4162 static void
4163 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4164 {
4165 	ibd_state_t *state = (ibd_state_t *)arg;
4166 
4167 	atomic_add_64(&state->id_num_intrs, 1);
4168 
4169 	if (ibd_rx_softintr == 1)
4170 		ddi_trigger_softintr(state->id_rx);
4171 	else
4172 		(void) ibd_intr((char *)state);
4173 }
4174 
4175 /*
4176  * Separate CQ handler for Tx completions, when the Tx CQ is in
4177  * interrupt driven mode.
4178  */
4179 /* ARGSUSED */
4180 static void
4181 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4182 {
4183 	ibd_state_t *state = (ibd_state_t *)arg;
4184 
4185 	atomic_add_64(&state->id_num_intrs, 1);
4186 
4187 	if (ibd_tx_softintr == 1)
4188 		ddi_trigger_softintr(state->id_tx);
4189 	else
4190 		(void) ibd_tx_recycle((char *)state);
4191 }
4192 
4193 /*
4194  * Multicast group create/delete trap handler. These will be delivered
4195  * on a kernel thread (handling can thus block) and can be invoked
4196  * concurrently. The handler can be invoked anytime after it is
4197  * registered and before ibt_detach().
4198  */
4199 /* ARGSUSED */
4200 static void
4201 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4202     ibt_subnet_event_t *event)
4203 {
4204 	ibd_state_t *state = (ibd_state_t *)arg;
4205 	ibd_req_t *req;
4206 
4207 	/*
4208 	 * The trap handler will get invoked once for every event for
4209 	 * evert port. The input "gid" is the GID0 of the port the
4210 	 * trap came in on; we just need to act on traps that came
4211 	 * to our port, meaning the port on which the ipoib interface
4212 	 * resides. Since ipoib uses GID0 of the port, we just match
4213 	 * the gids to check whether we need to handle the trap.
4214 	 */
4215 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4216 		return;
4217 
4218 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4219 
4220 	switch (code) {
4221 		case IBT_SM_EVENT_UNAVAILABLE:
4222 			/*
4223 			 * If we are in promiscuous mode or have
4224 			 * sendnonmembers, we need to print a warning
4225 			 * message right now. Else, just store the
4226 			 * information, print when we enter promiscuous
4227 			 * mode or attempt nonmember send. We might
4228 			 * also want to stop caching sendnonmember.
4229 			 */
4230 			ibd_print_warn(state, "IBA multicast support "
4231 			    "degraded due to unavailability of multicast "
4232 			    "traps");
4233 			break;
4234 		case IBT_SM_EVENT_AVAILABLE:
4235 			/*
4236 			 * If we printed a warning message above or
4237 			 * while trying to nonmember send or get into
4238 			 * promiscuous mode, print an okay message.
4239 			 */
4240 			ibd_print_warn(state, "IBA multicast support "
4241 			    "restored due to availability of multicast "
4242 			    "traps");
4243 			break;
4244 		case IBT_SM_EVENT_MCG_CREATED:
4245 		case IBT_SM_EVENT_MCG_DELETED:
4246 			/*
4247 			 * Common processing of creation/deletion traps.
4248 			 * First check if the instance is being
4249 			 * [de]initialized; back off then, without doing
4250 			 * anything more, since we are not sure if the
4251 			 * async thread is around, or whether we might
4252 			 * be racing with the detach code in ibd_drv_fini()
4253 			 * that scans the mcg list.
4254 			 */
4255 			if (!ibd_async_safe(state))
4256 				return;
4257 
4258 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4259 			req->rq_gid = event->sm_notice_gid;
4260 			req->rq_ptr = (void *)code;
4261 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4262 			break;
4263 	}
4264 }
4265 
4266 static void
4267 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4268 {
4269 	ib_gid_t mgid = req->rq_gid;
4270 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4271 
4272 	DPRINT(10, "ibd_async_trap : %d\n", code);
4273 
4274 	/*
4275 	 * Atomically search the nonmember and sendonlymember lists and
4276 	 * delete.
4277 	 */
4278 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4279 
4280 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4281 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4282 
4283 		/*
4284 		 * If in promiscuous mode, try to join/attach to the new
4285 		 * mcg. Given the unreliable out-of-order mode of trap
4286 		 * delivery, we can never be sure whether it is a problem
4287 		 * if the join fails. Thus, we warn the admin of a failure
4288 		 * if this was a creation trap. Note that the trap might
4289 		 * actually be reporting a long past event, and the mcg
4290 		 * might already have been deleted, thus we might be warning
4291 		 * in vain.
4292 		 */
4293 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4294 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4295 			ibd_print_warn(state, "IBA promiscuous mode missed "
4296 			    "new multicast gid %016llx:%016llx",
4297 			    (u_longlong_t)mgid.gid_prefix,
4298 			    (u_longlong_t)mgid.gid_guid);
4299 	}
4300 
4301 	/*
4302 	 * Free the request slot allocated by the subnet event thread.
4303 	 */
4304 	ibd_async_done(state);
4305 }
4306 
4307 /*
4308  * GLDv3 entry point to get capabilities.
4309  */
4310 static boolean_t
4311 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4312 {
4313 	ibd_state_t *state = arg;
4314 
4315 	switch (cap) {
4316 	case MAC_CAPAB_HCKSUM: {
4317 		uint32_t *txflags = cap_data;
4318 
4319 		/*
4320 		 * We either do full checksum or not do it at all
4321 		 */
4322 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4323 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4324 		else
4325 			return (B_FALSE);
4326 		break;
4327 	}
4328 
4329 	case MAC_CAPAB_LSO: {
4330 		mac_capab_lso_t *cap_lso = cap_data;
4331 
4332 		/*
4333 		 * In addition to the capability and policy, since LSO
4334 		 * relies on hw checksum, we'll not enable LSO if we
4335 		 * don't have hw checksum.  Of course, if the HCA doesn't
4336 		 * provide the reserved lkey capability, enabling LSO will
4337 		 * actually affect performance adversely, so we'll disable
4338 		 * LSO even for that case.
4339 		 */
4340 		if (!state->id_lso_policy || !state->id_lso_capable)
4341 			return (B_FALSE);
4342 
4343 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4344 			return (B_FALSE);
4345 
4346 		if (state->id_hca_res_lkey_capab == 0) {
4347 			ibd_print_warn(state, "no reserved-lkey capability, "
4348 			    "disabling LSO");
4349 			return (B_FALSE);
4350 		}
4351 
4352 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4353 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4354 		break;
4355 	}
4356 
4357 	default:
4358 		return (B_FALSE);
4359 	}
4360 
4361 	return (B_TRUE);
4362 }
4363 
/*
 * GLDv3 entry point to start the hardware. Nothing is done here;
 * success (0) is always reported.
 */
/*ARGSUSED*/
static int
ibd_m_start(void *arg)
{
	return (0);
}
4373 
/*
 * GLDv3 entry point to stop the hardware from receiving packets.
 * This is a no-op.
 */
/*ARGSUSED*/
static void
ibd_m_stop(void *arg)
{
}
4382 
4383 /*
4384  * GLDv3 entry point to modify device's mac address. We do not
4385  * allow address modifications.
4386  */
4387 static int
4388 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4389 {
4390 	ibd_state_t *state;
4391 
4392 	state = (ibd_state_t *)arg;
4393 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4394 		return (0);
4395 	else
4396 		return (EINVAL);
4397 }
4398 
4399 /*
4400  * The blocking part of the IBA join/leave operations are done out
4401  * of here on the async thread.
4402  */
4403 static void
4404 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4405 {
4406 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4407 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4408 
4409 	if (op == IBD_ASYNC_JOIN) {
4410 
4411 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4412 			ibd_print_warn(state, "Joint multicast group failed :"
4413 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4414 		}
4415 	} else {
4416 		/*
4417 		 * Here, we must search for the proper mcg_info and
4418 		 * use that to leave the group.
4419 		 */
4420 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4421 	}
4422 }
4423 
4424 /*
4425  * GLDv3 entry point for multicast enable/disable requests.
4426  * This function queues the operation to the async thread and
4427  * return success for a valid multicast address.
4428  */
4429 static int
4430 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4431 {
4432 	ibd_state_t *state = (ibd_state_t *)arg;
4433 	ipoib_mac_t maddr, *mcast;
4434 	ib_gid_t mgid;
4435 	ibd_req_t *req;
4436 
4437 	/*
4438 	 * The incoming multicast address might not be aligned properly
4439 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4440 	 * it to look like one though, to get the offsets of the mc gid,
4441 	 * since we know we are not going to dereference any values with
4442 	 * the ipoib_mac_t pointer.
4443 	 */
4444 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4445 	mcast = &maddr;
4446 
4447 	/*
4448 	 * Check validity of MCG address. We could additionally check
4449 	 * that a enable/disable is not being issued on the "broadcast"
4450 	 * mcg, but since this operation is only invokable by priviledged
4451 	 * programs anyway, we allow the flexibility to those dlpi apps.
4452 	 * Note that we do not validate the "scope" of the IBA mcg.
4453 	 */
4454 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4455 		return (EINVAL);
4456 
4457 	/*
4458 	 * fill in multicast pkey and scope
4459 	 */
4460 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4461 
4462 	/*
4463 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4464 	 * nothing (ie we stay JOINed to the broadcast group done in
4465 	 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
4466 	 * requires to be joined to broadcast groups at all times.
4467 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4468 	 * depends on this.
4469 	 */
4470 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4471 		return (0);
4472 
4473 	ibd_n2h_gid(mcast, &mgid);
4474 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4475 	if (req == NULL)
4476 		return (ENOMEM);
4477 
4478 	req->rq_gid = mgid;
4479 
4480 	if (add) {
4481 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4482 		    mgid.gid_prefix, mgid.gid_guid);
4483 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
4484 	} else {
4485 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4486 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4487 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
4488 	}
4489 	return (0);
4490 }
4491 
4492 /*
4493  * The blocking part of the IBA promiscuous operations are done
4494  * out of here on the async thread. The dlpireq parameter indicates
4495  * whether this invocation is due to a dlpi request or due to
4496  * a port up/down event.
4497  */
4498 static void
4499 ibd_async_unsetprom(ibd_state_t *state)
4500 {
4501 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4502 	ib_gid_t mgid;
4503 
4504 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4505 
4506 	while (mce != NULL) {
4507 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4508 		mce = list_next(&state->id_mc_non, mce);
4509 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4510 	}
4511 	state->id_prom_op = IBD_OP_NOTSTARTED;
4512 }
4513 
4514 /*
4515  * The blocking part of the IBA promiscuous operations are done
4516  * out of here on the async thread. The dlpireq parameter indicates
4517  * whether this invocation is due to a dlpi request or due to
4518  * a port up/down event.
4519  */
4520 static void
4521 ibd_async_setprom(ibd_state_t *state)
4522 {
4523 	ibt_mcg_attr_t mcg_attr;
4524 	ibt_mcg_info_t *mcg_info;
4525 	ib_gid_t mgid;
4526 	uint_t numg;
4527 	int i, ret = IBD_OP_COMPLETED;
4528 
4529 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4530 
4531 	/*
4532 	 * Obtain all active MC groups on the IB fabric with
4533 	 * specified criteria (scope + Pkey + Qkey + mtu).
4534 	 */
4535 	bzero(&mcg_attr, sizeof (mcg_attr));
4536 	mcg_attr.mc_pkey = state->id_pkey;
4537 	mcg_attr.mc_scope = state->id_scope;
4538 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4539 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4540 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4541 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4542 	    IBT_SUCCESS) {
4543 		ibd_print_warn(state, "Could not get list of IBA multicast "
4544 		    "groups");
4545 		ret = IBD_OP_ERRORED;
4546 		goto done;
4547 	}
4548 
4549 	/*
4550 	 * Iterate over the returned mcg's and join as NonMember
4551 	 * to the IP mcg's.
4552 	 */
4553 	for (i = 0; i < numg; i++) {
4554 		/*
4555 		 * Do a NonMember JOIN on the MC group.
4556 		 */
4557 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4558 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4559 			ibd_print_warn(state, "IBA promiscuous mode missed "
4560 			    "multicast gid %016llx:%016llx",
4561 			    (u_longlong_t)mgid.gid_prefix,
4562 			    (u_longlong_t)mgid.gid_guid);
4563 	}
4564 
4565 	ibt_free_mcg_info(mcg_info, numg);
4566 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4567 done:
4568 	state->id_prom_op = ret;
4569 }
4570 
4571 /*
4572  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4573  * GLDv3 assumes phys state receives more packets than multi state,
4574  * which is not true for IPoIB. Thus, treat the multi and phys
4575  * promiscuous states the same way to work with GLDv3's assumption.
4576  */
4577 static int
4578 ibd_m_promisc(void *arg, boolean_t on)
4579 {
4580 	ibd_state_t *state = (ibd_state_t *)arg;
4581 	ibd_req_t *req;
4582 
4583 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4584 	if (req == NULL)
4585 		return (ENOMEM);
4586 	if (on) {
4587 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4588 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
4589 	} else {
4590 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4591 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
4592 	}
4593 
4594 	return (0);
4595 }
4596 
4597 /*
4598  * GLDv3 entry point for gathering statistics.
4599  */
4600 static int
4601 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
4602 {
4603 	ibd_state_t *state = (ibd_state_t *)arg;
4604 
4605 	switch (stat) {
4606 	case MAC_STAT_IFSPEED:
4607 		*val = state->id_link_speed;
4608 		break;
4609 	case MAC_STAT_MULTIRCV:
4610 		*val = state->id_multi_rcv;
4611 		break;
4612 	case MAC_STAT_BRDCSTRCV:
4613 		*val = state->id_brd_rcv;
4614 		break;
4615 	case MAC_STAT_MULTIXMT:
4616 		*val = state->id_multi_xmt;
4617 		break;
4618 	case MAC_STAT_BRDCSTXMT:
4619 		*val = state->id_brd_xmt;
4620 		break;
4621 	case MAC_STAT_RBYTES:
4622 		*val = state->id_rcv_bytes;
4623 		break;
4624 	case MAC_STAT_IPACKETS:
4625 		*val = state->id_rcv_pkt;
4626 		break;
4627 	case MAC_STAT_OBYTES:
4628 		*val = state->id_xmt_bytes;
4629 		break;
4630 	case MAC_STAT_OPACKETS:
4631 		*val = state->id_xmt_pkt;
4632 		break;
4633 	case MAC_STAT_OERRORS:
4634 		*val = state->id_ah_error;	/* failed AH translation */
4635 		break;
4636 	case MAC_STAT_IERRORS:
4637 		*val = 0;
4638 		break;
4639 	case MAC_STAT_NOXMTBUF:
4640 		*val = state->id_tx_short;
4641 		break;
4642 	case MAC_STAT_NORCVBUF:
4643 	default:
4644 		return (ENOTSUP);
4645 	}
4646 
4647 	return (0);
4648 }
4649 
4650 static void
4651 ibd_async_txsched(ibd_state_t *state)
4652 {
4653 	ibd_req_t *req;
4654 	int ret;
4655 
4656 	if (ibd_txcomp_poll)
4657 		ibd_poll_compq(state, state->id_scq_hdl);
4658 
4659 	ret = ibd_resume_transmission(state);
4660 	if (ret && ibd_txcomp_poll) {
4661 		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
4662 			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
4663 		else {
4664 			ibd_print_warn(state, "ibd_async_txsched: "
4665 			    "no memory, can't schedule work slot");
4666 		}
4667 	}
4668 }
4669 
4670 static int
4671 ibd_resume_transmission(ibd_state_t *state)
4672 {
4673 	int flag;
4674 	int met_thresh = 0;
4675 	int ret = -1;
4676 
4677 	mutex_enter(&state->id_sched_lock);
4678 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
4679 		met_thresh = (state->id_tx_list.dl_cnt >
4680 		    IBD_FREE_SWQES_THRESH);
4681 		flag = IBD_RSRC_SWQE;
4682 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
4683 		ASSERT(state->id_lso != NULL);
4684 		met_thresh = (state->id_lso->bkt_nfree >
4685 		    IBD_FREE_LSOS_THRESH);
4686 		flag = IBD_RSRC_LSOBUF;
4687 	}
4688 	if (met_thresh) {
4689 		state->id_sched_needed &= ~flag;
4690 		ret = 0;
4691 	}
4692 	mutex_exit(&state->id_sched_lock);
4693 
4694 	if (ret == 0)
4695 		mac_tx_update(state->id_mh);
4696 
4697 	return (ret);
4698 }
4699 
4700 /*
4701  * Release the send wqe back into free list.
4702  */
4703 static void
4704 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
4705 {
4706 	/*
4707 	 * Add back on Tx list for reuse.
4708 	 */
4709 	swqe->swqe_next = NULL;
4710 	mutex_enter(&state->id_tx_list.dl_mutex);
4711 	if (state->id_tx_list.dl_pending_sends) {
4712 		state->id_tx_list.dl_pending_sends = B_FALSE;
4713 	}
4714 	if (state->id_tx_list.dl_head == NULL) {
4715 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
4716 	} else {
4717 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
4718 	}
4719 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
4720 	state->id_tx_list.dl_cnt++;
4721 	mutex_exit(&state->id_tx_list.dl_mutex);
4722 }
4723 
4724 /*
4725  * Acquire a send wqe from free list.
4726  * Returns error number and send wqe pointer.
4727  */
4728 static int
4729 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
4730 {
4731 	int rc = 0;
4732 	ibd_swqe_t *wqe;
4733 
4734 	/*
4735 	 * Check and reclaim some of the completed Tx requests.
4736 	 * If someone else is already in this code and pulling Tx
4737 	 * completions, no need to poll, since the current lock holder
4738 	 * will do the work anyway. Normally, we poll for completions
4739 	 * every few Tx attempts, but if we are short on Tx descriptors,
4740 	 * we always try to poll.
4741 	 */
4742 	if ((ibd_txcomp_poll == 1) &&
4743 	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
4744 		ibd_poll_compq(state, state->id_scq_hdl);
4745 	}
4746 
4747 	/*
4748 	 * Grab required transmit wqes.
4749 	 */
4750 	mutex_enter(&state->id_tx_list.dl_mutex);
4751 	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
4752 	if (wqe != NULL) {
4753 		state->id_tx_list.dl_cnt -= 1;
4754 		state->id_tx_list.dl_head = wqe->swqe_next;
4755 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
4756 			state->id_tx_list.dl_tail = NULL;
4757 	} else {
4758 		/*
4759 		 * If we did not find the number we were looking for, flag
4760 		 * no resource. Adjust list appropriately in either case.
4761 		 */
4762 		rc = ENOENT;
4763 		state->id_tx_list.dl_pending_sends = B_TRUE;
4764 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
4765 		atomic_add_64(&state->id_tx_short, 1);
4766 	}
4767 	mutex_exit(&state->id_tx_list.dl_mutex);
4768 	*swqe = wqe;
4769 
4770 	return (rc);
4771 }
4772 
4773 static int
4774 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
4775     ibt_ud_dest_hdl_t ud_dest)
4776 {
4777 	mblk_t	*nmp;
4778 	int iph_len, tcph_len;
4779 	ibt_wr_lso_t *lso;
4780 	uintptr_t ip_start, tcp_start;
4781 	uint8_t *dst;
4782 	uint_t pending, mblen;
4783 
4784 	/*
4785 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
4786 	 * we need to adjust it here for lso.
4787 	 */
4788 	lso = &(node->w_swr.wr.ud_lso);
4789 	lso->lso_ud_dest = ud_dest;
4790 	lso->lso_mss = mss;
4791 
4792 	/*
4793 	 * Calculate the LSO header size and set it in the UD LSO structure.
4794 	 * Note that the only assumption we make is that each of the IPoIB,
4795 	 * IP and TCP headers will be contained in a single mblk fragment;
4796 	 * together, the headers may span multiple mblk fragments.
4797 	 */
4798 	nmp = mp;
4799 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
4800 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
4801 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
4802 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
4803 		nmp = nmp->b_cont;
4804 
4805 	}
4806 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
4807 
4808 	tcp_start = ip_start + iph_len;
4809 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
4810 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
4811 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
4812 		nmp = nmp->b_cont;
4813 	}
4814 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
4815 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
4816 
4817 	/*
4818 	 * If the lso header fits entirely within a single mblk fragment,
4819 	 * we'll avoid an additional copy of the lso header here and just
4820 	 * pass the b_rptr of the mblk directly.
4821 	 *
4822 	 * If this isn't true, we'd have to allocate for it explicitly.
4823 	 */
4824 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
4825 		lso->lso_hdr = mp->b_rptr;
4826 	} else {
4827 		/* On work completion, remember to free this allocated hdr */
4828 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
4829 		if (lso->lso_hdr == NULL) {
4830 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
4831 			    "sz = %d", lso->lso_hdr_sz);
4832 			lso->lso_hdr_sz = 0;
4833 			lso->lso_mss = 0;
4834 			return (-1);
4835 		}
4836 	}
4837 
4838 	/*
4839 	 * Copy in the lso header only if we need to
4840 	 */
4841 	if (lso->lso_hdr != mp->b_rptr) {
4842 		dst = lso->lso_hdr;
4843 		pending = lso->lso_hdr_sz;
4844 
4845 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
4846 			mblen = MBLKL(nmp);
4847 			if (pending > mblen) {
4848 				bcopy(nmp->b_rptr, dst, mblen);
4849 				dst += mblen;
4850 				pending -= mblen;
4851 			} else {
4852 				bcopy(nmp->b_rptr, dst, pending);
4853 				break;
4854 			}
4855 		}
4856 	}
4857 
4858 	return (0);
4859 }
4860 
4861 static void
4862 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
4863 {
4864 	ibt_wr_lso_t *lso;
4865 
4866 	if ((!node) || (!mp))
4867 		return;
4868 
4869 	/*
4870 	 * Free any header space that we might've allocated if we
4871 	 * did an LSO
4872 	 */
4873 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
4874 		lso = &(node->w_swr.wr.ud_lso);
4875 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
4876 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
4877 			lso->lso_hdr = NULL;
4878 			lso->lso_hdr_sz = 0;
4879 		}
4880 	}
4881 }
4882 
4883 static void
4884 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
4885 {
4886 	uint_t		i;
4887 	uint_t		num_posted;
4888 	uint_t		n_wrs;
4889 	ibt_status_t	ibt_status;
4890 	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
4891 	ibd_swqe_t	*elem;
4892 	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];
4893 
4894 	node->swqe_next = NULL;
4895 
4896 	mutex_enter(&state->id_txpost_lock);
4897 
4898 	/*
4899 	 * Enqueue the new node in chain of wqes to send
4900 	 */
4901 	if (state->id_tx_head) {
4902 		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
4903 	} else {
4904 		state->id_tx_head = node;
4905 	}
4906 	state->id_tx_tailp = &(node->swqe_next);
4907 
4908 	/*
4909 	 * If someone else is helping out with the sends,
4910 	 * just go back
4911 	 */
4912 	if (state->id_tx_busy) {
4913 		mutex_exit(&state->id_txpost_lock);
4914 		return;
4915 	}
4916 
4917 	/*
4918 	 * Otherwise, mark the flag to indicate that we'll be
4919 	 * doing the dispatch of what's there in the wqe chain
4920 	 */
4921 	state->id_tx_busy = 1;
4922 
4923 	while (state->id_tx_head) {
4924 		/*
4925 		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
4926 		 * at a time if possible, and keep posting them.
4927 		 */
4928 		for (n_wrs = 0, elem = state->id_tx_head;
4929 		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
4930 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
4931 
4932 			nodes[n_wrs] = elem;
4933 			wrs[n_wrs] = elem->w_swr;
4934 		}
4935 		state->id_tx_head = elem;
4936 
4937 		/*
4938 		 * Release the txpost lock before posting the
4939 		 * send request to the hca; if the posting fails
4940 		 * for some reason, we'll never receive completion
4941 		 * intimation, so we'll need to cleanup.
4942 		 */
4943 		mutex_exit(&state->id_txpost_lock);
4944 
4945 		ASSERT(n_wrs != 0);
4946 
4947 		/*
4948 		 * If posting fails for some reason, we'll never receive
4949 		 * completion intimation, so we'll need to cleanup. But
4950 		 * we need to make sure we don't clean up nodes whose
4951 		 * wrs have been successfully posted. We assume that the
4952 		 * hca driver returns on the first failure to post and
4953 		 * therefore the first 'num_posted' entries don't need
4954 		 * cleanup here.
4955 		 */
4956 		num_posted = 0;
4957 		ibt_status = ibt_post_send(state->id_chnl_hdl,
4958 		    wrs, n_wrs, &num_posted);
4959 		if (ibt_status != IBT_SUCCESS) {
4960 
4961 			ibd_print_warn(state, "ibd_post_send: "
4962 			    "posting multiple wrs failed: "
4963 			    "requested=%d, done=%d, ret=%d",
4964 			    n_wrs, num_posted, ibt_status);
4965 
4966 			for (i = num_posted; i < n_wrs; i++)
4967 				ibd_tx_cleanup(state, nodes[i]);
4968 		}
4969 
4970 		/*
4971 		 * Grab the mutex before we go and check the tx Q again
4972 		 */
4973 		mutex_enter(&state->id_txpost_lock);
4974 	}
4975 
4976 	state->id_tx_busy = 0;
4977 	mutex_exit(&state->id_txpost_lock);
4978 }
4979 
4980 static int
4981 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
4982     uint_t lsohdr_sz)
4983 {
4984 	ibt_wr_ds_t *sgl;
4985 	ibt_status_t ibt_status;
4986 	mblk_t *nmp;
4987 	mblk_t *data_mp;
4988 	uchar_t *bufp;
4989 	size_t blksize;
4990 	size_t skip;
4991 	size_t avail;
4992 	uint_t pktsize;
4993 	uint_t frag_len;
4994 	uint_t pending_hdr;
4995 	uint_t hiwm;
4996 	int nmblks;
4997 	int i;
4998 
4999 	/*
5000 	 * Let's skip ahead to the data if this is LSO
5001 	 */
5002 	data_mp = mp;
5003 	pending_hdr = 0;
5004 	if (lsohdr_sz) {
5005 		pending_hdr = lsohdr_sz;
5006 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5007 			frag_len = nmp->b_wptr - nmp->b_rptr;
5008 			if (frag_len > pending_hdr)
5009 				break;
5010 			pending_hdr -= frag_len;
5011 		}
5012 		data_mp = nmp;	/* start of data past lso header */
5013 		ASSERT(data_mp != NULL);
5014 	}
5015 
5016 	/*
5017 	 * Calculate the size of message data and number of msg blocks
5018 	 */
5019 	pktsize = 0;
5020 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5021 	    nmp = nmp->b_cont, nmblks++) {
5022 		pktsize += MBLKL(nmp);
5023 	}
5024 	pktsize -= pending_hdr;
5025 
5026 	/*
5027 	 * Translating the virtual address regions into physical regions
5028 	 * for using the Reserved LKey feature results in a wr sgl that
5029 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
5030 	 * we'll fix a high-water mark (65%) for when we should stop.
5031 	 */
5032 	hiwm = (state->id_max_sqseg * 65) / 100;
5033 
5034 	/*
5035 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5036 	 * "copy-threshold", and if the number of mp fragments is less than
5037 	 * the maximum acceptable.
5038 	 */
5039 	if ((state->id_hca_res_lkey_capab) &&
5040 	    (pktsize > IBD_TX_COPY_THRESH) &&
5041 	    (nmblks < hiwm)) {
5042 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5043 		ibt_iov_attr_t iov_attr;
5044 
5045 		iov_attr.iov_as = NULL;
5046 		iov_attr.iov = iov_arr;
5047 		iov_attr.iov_buf = NULL;
5048 		iov_attr.iov_list_len = nmblks;
5049 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5050 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5051 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5052 
5053 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5054 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5055 			iov_arr[i].iov_len = MBLKL(nmp);
5056 			if (i == 0) {
5057 				iov_arr[i].iov_addr += pending_hdr;
5058 				iov_arr[i].iov_len -= pending_hdr;
5059 			}
5060 		}
5061 
5062 		node->w_buftype = IBD_WQE_MAPPED;
5063 		node->w_swr.wr_sgl = node->w_sgl;
5064 
5065 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5066 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5067 		if (ibt_status != IBT_SUCCESS) {
5068 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5069 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5070 			goto ibd_copy_path;
5071 		}
5072 
5073 		return (0);
5074 	}
5075 
5076 ibd_copy_path:
5077 	if (pktsize <= state->id_tx_buf_sz) {
5078 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5079 		node->w_swr.wr_nds = 1;
5080 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5081 		node->w_buftype = IBD_WQE_TXBUF;
5082 
5083 		/*
5084 		 * Even though this is the copy path for transfers less than
5085 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5086 		 * is possible the first data mblk fragment (data_mp) still
5087 		 * contains part of the LSO header that we need to skip.
5088 		 */
5089 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5090 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5091 			blksize = MBLKL(nmp) - pending_hdr;
5092 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5093 			bufp += blksize;
5094 			pending_hdr = 0;
5095 		}
5096 
5097 		return (0);
5098 	}
5099 
5100 	/*
5101 	 * Copy path for transfers greater than id_tx_buf_sz
5102 	 */
5103 	node->w_swr.wr_sgl = node->w_sgl;
5104 	if (ibd_acquire_lsobufs(state, pktsize,
5105 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5106 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5107 		return (-1);
5108 	}
5109 	node->w_buftype = IBD_WQE_LSOBUF;
5110 
5111 	/*
5112 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5113 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5114 	 * need to skip part of the LSO header in the first fragment
5115 	 * as before.
5116 	 */
5117 	nmp = data_mp;
5118 	skip = pending_hdr;
5119 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5120 		sgl = node->w_swr.wr_sgl + i;
5121 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5122 		avail = IBD_LSO_BUFSZ;
5123 		while (nmp && avail) {
5124 			blksize = MBLKL(nmp) - skip;
5125 			if (blksize > avail) {
5126 				bcopy(nmp->b_rptr + skip, bufp, avail);
5127 				skip += avail;
5128 				avail = 0;
5129 			} else {
5130 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5131 				skip = 0;
5132 				avail -= blksize;
5133 				bufp += blksize;
5134 				nmp = nmp->b_cont;
5135 			}
5136 		}
5137 	}
5138 
5139 	return (0);
5140 }
5141 
5142 /*
5143  * Schedule a completion queue polling to reap the resource we're
5144  * short on.  If we implement the change to reap tx completions
5145  * in a separate thread, we'll need to wake up that thread here.
5146  */
5147 static int
5148 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5149 {
5150 	ibd_req_t *req;
5151 
5152 	mutex_enter(&state->id_sched_lock);
5153 	state->id_sched_needed |= resource_type;
5154 	mutex_exit(&state->id_sched_lock);
5155 
5156 	/*
5157 	 * If we are asked to queue a work entry, we need to do it
5158 	 */
5159 	if (q_flag) {
5160 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5161 		if (req == NULL)
5162 			return (-1);
5163 
5164 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5165 	}
5166 
5167 	return (0);
5168 }
5169 
5170 /*
5171  * The passed in packet has this format:
5172  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5173  */
5174 static boolean_t
5175 ibd_send(ibd_state_t *state, mblk_t *mp)
5176 {
5177 	ibd_ace_t *ace;
5178 	ibd_swqe_t *node;
5179 	ipoib_mac_t *dest;
5180 	ib_header_info_t *ipibp;
5181 	ip6_t *ip6h;
5182 	uint_t pktsize;
5183 	uint32_t mss;
5184 	uint32_t hckflags;
5185 	uint32_t lsoflags = 0;
5186 	uint_t lsohdr_sz = 0;
5187 	int ret, len;
5188 	boolean_t dofree = B_FALSE;
5189 	boolean_t rc;
5190 
5191 	node = NULL;
5192 	if (ibd_acquire_swqe(state, &node) != 0) {
5193 		/*
5194 		 * If we don't have an swqe available, schedule a transmit
5195 		 * completion queue cleanup and hold off on sending more
5196 		 * more packets until we have some free swqes
5197 		 */
5198 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5199 			return (B_FALSE);
5200 
5201 		/*
5202 		 * If a poll cannot be scheduled, we have no choice but
5203 		 * to drop this packet
5204 		 */
5205 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5206 		return (B_TRUE);
5207 	}
5208 
5209 	/*
5210 	 * Initialize the commonly used fields in swqe to NULL to protect
5211 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5212 	 * failure.
5213 	 */
5214 	node->swqe_im_mblk = NULL;
5215 	node->w_swr.wr_nds = 0;
5216 	node->w_swr.wr_sgl = NULL;
5217 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5218 
5219 	/*
5220 	 * Obtain an address handle for the destination.
5221 	 */
5222 	ipibp = (ib_header_info_t *)mp->b_rptr;
5223 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5224 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5225 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5226 
5227 	pktsize = msgsize(mp);
5228 
5229 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5230 	atomic_inc_64(&state->id_xmt_pkt);
5231 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5232 		atomic_inc_64(&state->id_brd_xmt);
5233 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5234 		atomic_inc_64(&state->id_multi_xmt);
5235 
5236 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5237 		node->w_ahandle = ace;
5238 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5239 	} else {
5240 		DPRINT(5,
5241 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5242 		    ((ret == EFAULT) ? "failed" : "queued"),
5243 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5244 		    htonl(dest->ipoib_gidpref[1]),
5245 		    htonl(dest->ipoib_gidsuff[0]),
5246 		    htonl(dest->ipoib_gidsuff[1]));
5247 		node->w_ahandle = NULL;
5248 
5249 		/*
5250 		 * for the poll mode, it is probably some cqe pending in the
5251 		 * cq. So ibd has to poll cq here, otherwise acache probably
5252 		 * may not be recycled.
5253 		 */
5254 		if (ibd_txcomp_poll == 1)
5255 			ibd_poll_compq(state, state->id_scq_hdl);
5256 
5257 		/*
5258 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5259 		 * can not find a path for the specific dest address. We
5260 		 * should get rid of this kind of packet.  We also should get
5261 		 * rid of the packet if we cannot schedule a poll via the
5262 		 * async thread.  For the normal case, ibd will return the
5263 		 * packet to upper layer and wait for AH creating.
5264 		 *
5265 		 * Note that we always queue a work slot entry for the async
5266 		 * thread when we fail AH lookup (even in intr mode); this is
5267 		 * due to the convoluted way the code currently looks for AH.
5268 		 */
5269 		if (ret == EFAULT) {
5270 			dofree = B_TRUE;
5271 			rc = B_TRUE;
5272 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5273 			dofree = B_TRUE;
5274 			rc = B_TRUE;
5275 		} else {
5276 			dofree = B_FALSE;
5277 			rc = B_FALSE;
5278 		}
5279 		goto ibd_send_fail;
5280 	}
5281 
5282 	/*
5283 	 * For ND6 packets, padding is at the front of the source lladdr.
5284 	 * Insert the padding at front.
5285 	 */
5286 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
5287 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5288 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5289 			    sizeof (ib_header_info_t))) {
5290 				DPRINT(10, "ibd_send: pullupmsg failure ");
5291 				dofree = B_TRUE;
5292 				rc = B_TRUE;
5293 				goto ibd_send_fail;
5294 			}
5295 			ipibp = (ib_header_info_t *)mp->b_rptr;
5296 		}
5297 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5298 		    sizeof (ib_header_info_t));
5299 		len = ntohs(ip6h->ip6_plen);
5300 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5301 			mblk_t	*pad;
5302 
5303 			pad = allocb(4, 0);
5304 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5305 			linkb(mp, pad);
5306 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5307 			    IPV6_HDR_LEN + len + 4) {
5308 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5309 				    IPV6_HDR_LEN + len + 4)) {
5310 					DPRINT(10, "ibd_send: pullupmsg "
5311 					    "failure ");
5312 					dofree = B_TRUE;
5313 					rc = B_TRUE;
5314 					goto ibd_send_fail;
5315 				}
5316 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5317 				    sizeof (ib_header_info_t));
5318 			}
5319 
5320 			/* LINTED: E_CONSTANT_CONDITION */
5321 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5322 		}
5323 	}
5324 
5325 	mp->b_rptr += sizeof (ib_addrs_t);
5326 
5327 	/*
5328 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5329 	 * ud destination, the opcode and the LSO header information to the
5330 	 * work request.
5331 	 */
5332 	lso_info_get(mp, &mss, &lsoflags);
5333 	if ((lsoflags & HW_LSO) != HW_LSO) {
5334 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5335 		lsohdr_sz = 0;
5336 	} else {
5337 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5338 			/*
5339 			 * The routine can only fail if there's no memory; we
5340 			 * can only drop the packet if this happens
5341 			 */
5342 			ibd_print_warn(state,
5343 			    "ibd_send: no memory, lso posting failed");
5344 			dofree = B_TRUE;
5345 			rc = B_TRUE;
5346 			goto ibd_send_fail;
5347 		}
5348 
5349 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5350 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5351 	}
5352 
5353 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5354 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5355 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5356 	else
5357 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5358 
5359 	/*
5360 	 * Prepare the sgl for posting; the routine can only fail if there's
5361 	 * no lso buf available for posting. If this is the case, we should
5362 	 * probably resched for lso bufs to become available and then try again.
5363 	 */
5364 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5365 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5366 			dofree = B_TRUE;
5367 			rc = B_TRUE;
5368 		} else {
5369 			dofree = B_FALSE;
5370 			rc = B_FALSE;
5371 		}
5372 		goto ibd_send_fail;
5373 	}
5374 	node->swqe_im_mblk = mp;
5375 
5376 	/*
5377 	 * Queue the wqe to hardware; since we can now simply queue a
5378 	 * post instead of doing it serially, we cannot assume anything
5379 	 * about the 'node' after ibd_post_send() returns.
5380 	 */
5381 	ibd_post_send(state, node);
5382 
5383 	return (B_TRUE);
5384 
5385 ibd_send_fail:
5386 	if (node && mp)
5387 		ibd_free_lsohdr(node, mp);
5388 
5389 	if (dofree)
5390 		freemsg(mp);
5391 
5392 	if (node != NULL)
5393 		ibd_tx_cleanup(state, node);
5394 
5395 	return (rc);
5396 }
5397 
5398 /*
5399  * GLDv3 entry point for transmitting datagram.
5400  */
5401 static mblk_t *
5402 ibd_m_tx(void *arg, mblk_t *mp)
5403 {
5404 	ibd_state_t *state = (ibd_state_t *)arg;
5405 	mblk_t *next;
5406 
5407 	while (mp != NULL) {
5408 		next = mp->b_next;
5409 		mp->b_next = NULL;
5410 		if (ibd_send(state, mp) == B_FALSE) {
5411 			/* Send fail */
5412 			mp->b_next = next;
5413 			break;
5414 		}
5415 		mp = next;
5416 	}
5417 
5418 	return (mp);
5419 }
5420 
5421 /*
5422  * this handles Tx and Rx completions. With separate CQs, this handles
5423  * only Rx completions.
5424  */
5425 static uint_t
5426 ibd_intr(char *arg)
5427 {
5428 	ibd_state_t *state = (ibd_state_t *)arg;
5429 
5430 	ibd_poll_compq(state, state->id_rcq_hdl);
5431 
5432 	return (DDI_INTR_CLAIMED);
5433 }
5434 
5435 /*
5436  * Poll and drain the cq
5437  */
5438 static uint_t
5439 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
5440     uint_t numwcs)
5441 {
5442 	ibd_wqe_t *wqe;
5443 	ibt_wc_t *wc;
5444 	uint_t total_polled = 0;
5445 	uint_t num_polled;
5446 	int i;
5447 
5448 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5449 		total_polled += num_polled;
5450 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5451 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5452 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5453 			    (wqe->w_type == IBD_WQE_RECV));
5454 			if (wc->wc_status != IBT_WC_SUCCESS) {
5455 				/*
5456 				 * Channel being torn down.
5457 				 */
5458 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5459 					DPRINT(5, "ibd_drain_cq: flush error");
5460 					/*
5461 					 * Only invoke the Tx handler to
5462 					 * release possibly held resources
5463 					 * like AH refcount etc. Can not
5464 					 * invoke Rx handler because it might
5465 					 * try adding buffers to the Rx pool
5466 					 * when we are trying to deinitialize.
5467 					 */
5468 					if (wqe->w_type == IBD_WQE_RECV) {
5469 						continue;
5470 					} else {
5471 						DPRINT(10, "ibd_drain_cq: Bad "
5472 						    "status %d", wc->wc_status);
5473 					}
5474 				}
5475 			}
5476 			if (wqe->w_type == IBD_WQE_SEND) {
5477 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
5478 			} else {
5479 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5480 			}
5481 		}
5482 	}
5483 
5484 	return (total_polled);
5485 }
5486 
5487 /*
5488  * Common code for interrupt handling as well as for polling
5489  * for all completed wqe's while detaching.
5490  */
5491 static void
5492 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5493 {
5494 	ibt_wc_t *wcs;
5495 	uint_t numwcs;
5496 	int flag, redo_flag;
5497 	int redo = 1;
5498 	uint_t num_polled = 0;
5499 
5500 	if (ibd_separate_cqs == 1) {
5501 		if (cq_hdl == state->id_rcq_hdl) {
5502 			flag = IBD_RX_CQ_POLLING;
5503 			redo_flag = IBD_REDO_RX_CQ_POLLING;
5504 		} else {
5505 			flag = IBD_TX_CQ_POLLING;
5506 			redo_flag = IBD_REDO_TX_CQ_POLLING;
5507 		}
5508 	} else {
5509 		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
5510 		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
5511 	}
5512 
5513 	mutex_enter(&state->id_cq_poll_lock);
5514 	if (state->id_cq_poll_busy & flag) {
5515 		state->id_cq_poll_busy |= redo_flag;
5516 		mutex_exit(&state->id_cq_poll_lock);
5517 		return;
5518 	}
5519 	state->id_cq_poll_busy |= flag;
5520 	mutex_exit(&state->id_cq_poll_lock);
5521 
5522 	/*
5523 	 * In some cases (eg detaching), this code can be invoked on
5524 	 * any cpu after disabling cq notification (thus no concurrency
5525 	 * exists). Apart from that, the following applies normally:
5526 	 * The receive completion handling is always on the Rx interrupt
5527 	 * cpu. Transmit completion handling could be from any cpu if
5528 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5529 	 * is interrupt driven. Combined completion handling is always
5530 	 * on the interrupt cpu. Thus, lock accordingly and use the
5531 	 * proper completion array.
5532 	 */
5533 	if (ibd_separate_cqs == 1) {
5534 		if (cq_hdl == state->id_rcq_hdl) {
5535 			wcs = state->id_rxwcs;
5536 			numwcs = state->id_rxwcs_size;
5537 		} else {
5538 			wcs = state->id_txwcs;
5539 			numwcs = state->id_txwcs_size;
5540 		}
5541 	} else {
5542 		wcs = state->id_rxwcs;
5543 		numwcs = state->id_rxwcs_size;
5544 	}
5545 
5546 	/*
5547 	 * Poll and drain the CQ
5548 	 */
5549 	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5550 
5551 	/*
5552 	 * Enable CQ notifications and redrain the cq to catch any
5553 	 * completions we might have missed after the ibd_drain_cq()
5554 	 * above and before the ibt_enable_cq_notify() that follows.
5555 	 * Finally, service any new requests to poll the cq that
5556 	 * could've come in after the ibt_enable_cq_notify().
5557 	 */
5558 	do {
5559 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
5560 		    IBT_SUCCESS) {
5561 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5562 		}
5563 
5564 		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5565 
5566 		mutex_enter(&state->id_cq_poll_lock);
5567 		if (state->id_cq_poll_busy & redo_flag)
5568 			state->id_cq_poll_busy &= ~redo_flag;
5569 		else {
5570 			state->id_cq_poll_busy &= ~flag;
5571 			redo = 0;
5572 		}
5573 		mutex_exit(&state->id_cq_poll_lock);
5574 
5575 	} while (redo);
5576 
5577 	/*
5578 	 * If we polled the receive cq and found anything, we need to flush
5579 	 * it out to the nw layer here.
5580 	 */
5581 	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
5582 		ibd_flush_rx(state, NULL);
5583 	}
5584 }
5585 
5586 /*
5587  * Unmap the memory area associated with a given swqe.
5588  */
5589 static void
5590 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
5591 {
5592 	ibt_status_t stat;
5593 
5594 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
5595 
5596 	if (swqe->w_mi_hdl) {
5597 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
5598 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
5599 			DPRINT(10,
5600 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
5601 		}
5602 		swqe->w_mi_hdl = NULL;
5603 	}
5604 	swqe->w_swr.wr_nds = 0;
5605 }
5606 
5607 /*
5608  * Common code that deals with clean ups after a successful or
5609  * erroneous transmission attempt.
5610  */
5611 static void
5612 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
5613 {
5614 	ibd_ace_t *ace = swqe->w_ahandle;
5615 
5616 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5617 
5618 	/*
5619 	 * If this was a dynamic mapping in ibd_send(), we need to
5620 	 * unmap here. If this was an lso buffer we'd used for sending,
5621 	 * we need to release the lso buf to the pool, since the resource
5622 	 * is scarce. However, if this was simply a normal send using
5623 	 * the copybuf (present in each swqe), we don't need to release it.
5624 	 */
5625 	if (swqe->swqe_im_mblk != NULL) {
5626 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
5627 			ibd_unmap_mem(state, swqe);
5628 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
5629 			ibd_release_lsobufs(state,
5630 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
5631 		}
5632 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
5633 		freemsg(swqe->swqe_im_mblk);
5634 		swqe->swqe_im_mblk = NULL;
5635 	}
5636 
5637 	/*
5638 	 * Drop the reference count on the AH; it can be reused
5639 	 * now for a different destination if there are no more
5640 	 * posted sends that will use it. This can be eliminated
5641 	 * if we can always associate each Tx buffer with an AH.
5642 	 * The ace can be null if we are cleaning up from the
5643 	 * ibd_send() error path.
5644 	 */
5645 	if (ace != NULL) {
5646 		/*
5647 		 * The recycling logic can be eliminated from here
5648 		 * and put into the async thread if we create another
5649 		 * list to hold ACE's for unjoined mcg's.
5650 		 */
5651 		if (DEC_REF_DO_CYCLE(ace)) {
5652 			ibd_mce_t *mce;
5653 
5654 			/*
5655 			 * Check with the lock taken: we decremented
5656 			 * reference count without the lock, and some
5657 			 * transmitter might alreay have bumped the
5658 			 * reference count (possible in case of multicast
5659 			 * disable when we leave the AH on the active
5660 			 * list). If not still 0, get out, leaving the
5661 			 * recycle bit intact.
5662 			 *
5663 			 * Atomically transition the AH from active
5664 			 * to free list, and queue a work request to
5665 			 * leave the group and destroy the mce. No
5666 			 * transmitter can be looking at the AH or
5667 			 * the MCE in between, since we have the
5668 			 * ac_mutex lock. In the SendOnly reap case,
5669 			 * it is not neccesary to hold the ac_mutex
5670 			 * and recheck the ref count (since the AH was
5671 			 * taken off the active list), we just do it
5672 			 * to have uniform processing with the Full
5673 			 * reap case.
5674 			 */
5675 			mutex_enter(&state->id_ac_mutex);
5676 			mce = ace->ac_mce;
5677 			if (GET_REF_CYCLE(ace) == 0) {
5678 				CLEAR_REFCYCLE(ace);
5679 				/*
5680 				 * Identify the case of fullmember reap as
5681 				 * opposed to mcg trap reap. Also, port up
5682 				 * might set ac_mce to NULL to indicate Tx
5683 				 * cleanup should do no more than put the
5684 				 * AH in the free list (see ibd_async_link).
5685 				 */
5686 				if (mce != NULL) {
5687 					ace->ac_mce = NULL;
5688 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
5689 					/*
5690 					 * mc_req was initialized at mce
5691 					 * creation time.
5692 					 */
5693 					ibd_queue_work_slot(state,
5694 					    &mce->mc_req, IBD_ASYNC_REAP);
5695 				}
5696 				IBD_ACACHE_INSERT_FREE(state, ace);
5697 			}
5698 			mutex_exit(&state->id_ac_mutex);
5699 		}
5700 	}
5701 
5702 	/*
5703 	 * Release the send wqe for reuse.
5704 	 */
5705 	ibd_release_swqe(state, swqe);
5706 }
5707 
5708 /*
5709  * Hand off the processed rx mp chain to mac_rx()
5710  */
5711 static void
5712 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
5713 {
5714 	if (mpc == NULL) {
5715 		mutex_enter(&state->id_rx_lock);
5716 
5717 		mpc = state->id_rx_mp;
5718 
5719 		state->id_rx_mp = NULL;
5720 		state->id_rx_mp_tail = NULL;
5721 		state->id_rx_mp_len = 0;
5722 
5723 		mutex_exit(&state->id_rx_lock);
5724 	}
5725 
5726 	if (mpc) {
5727 		mac_rx(state->id_mh, state->id_rh, mpc);
5728 	}
5729 }
5730 
5731 /*
5732  * Processing to be done after receipt of a packet; hand off to GLD
5733  * in the format expected by GLD.  The received packet has this
5734  * format: 2b sap :: 00 :: data.
5735  */
5736 static void
5737 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5738 {
5739 	ib_header_info_t *phdr;
5740 	mblk_t *mp;
5741 	mblk_t *mpc = NULL;
5742 	ipoib_hdr_t *ipibp;
5743 	ipha_t *iphap;
5744 	ip6_t *ip6h;
5745 	int rxcnt, len;
5746 
5747 	/*
5748 	 * Track number handed to upper layer, and number still
5749 	 * available to receive packets.
5750 	 */
5751 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5752 	ASSERT(rxcnt >= 0);
5753 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5754 
5755 	/*
5756 	 * Adjust write pointer depending on how much data came in.
5757 	 */
5758 	mp = rwqe->rwqe_im_mblk;
5759 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5760 
5761 	/*
5762 	 * Make sure this is NULL or we're in trouble.
5763 	 */
5764 	if (mp->b_next != NULL) {
5765 		ibd_print_warn(state,
5766 		    "ibd_process_rx: got duplicate mp from rcq?");
5767 		mp->b_next = NULL;
5768 	}
5769 
5770 	/*
5771 	 * the IB link will deliver one of the IB link layer
5772 	 * headers called, the Global Routing Header (GRH).
5773 	 * ibd driver uses the information in GRH to build the
5774 	 * Header_info structure and pass it with the datagram up
5775 	 * to GLDv3.
5776 	 * If the GRH is not valid, indicate to GLDv3 by setting
5777 	 * the VerTcFlow field to 0.
5778 	 */
5779 	phdr = (ib_header_info_t *)mp->b_rptr;
5780 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
5781 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
5782 
5783 		/* if it is loop back packet, just drop it. */
5784 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
5785 		    IPOIB_ADDRL) == 0) {
5786 			freemsg(mp);
5787 			return;
5788 		}
5789 
5790 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
5791 		    sizeof (ipoib_mac_t));
5792 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
5793 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
5794 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
5795 		} else {
5796 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
5797 		}
5798 	} else {
5799 		/*
5800 		 * It can not be a IBA multicast packet. Must have been
5801 		 * unicast for us. Just copy the interface address to dst.
5802 		 */
5803 		phdr->ib_grh.ipoib_vertcflow = 0;
5804 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
5805 		    sizeof (ipoib_mac_t));
5806 	}
5807 
5808 	/*
5809 	 * For ND6 packets, padding is at the front of the source/target
5810 	 * lladdr. However the inet6 layer is not aware of it, hence remove
5811 	 * the padding from such packets.
5812 	 */
5813 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
5814 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
5815 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
5816 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5817 			    sizeof (ipoib_hdr_t))) {
5818 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
5819 				freemsg(mp);
5820 				return;
5821 			}
5822 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
5823 			    sizeof (ipoib_pgrh_t));
5824 		}
5825 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5826 		len = ntohs(ip6h->ip6_plen);
5827 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5828 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
5829 			    IPV6_HDR_LEN + len) {
5830 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
5831 				    IPV6_HDR_LEN + len)) {
5832 					DPRINT(10, "ibd_process_rx: pullupmsg"
5833 					    " failed");
5834 					freemsg(mp);
5835 					return;
5836 				}
5837 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5838 				    sizeof (ipoib_pgrh_t) +
5839 				    sizeof (ipoib_hdr_t));
5840 			}
5841 			/* LINTED: E_CONSTANT_CONDITION */
5842 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
5843 		}
5844 	}
5845 
5846 	/*
5847 	 * Update statistics
5848 	 */
5849 	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
5850 	atomic_inc_64(&state->id_rcv_pkt);
5851 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5852 		atomic_inc_64(&state->id_brd_rcv);
5853 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5854 		atomic_inc_64(&state->id_multi_rcv);
5855 
5856 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5857 	/*
5858 	 * Set receive checksum status in mp
5859 	 * Hardware checksumming can be considered valid only if:
5860 	 * 1. CQE.IP_OK bit is set
5861 	 * 2. CQE.CKSUM = 0xffff
5862 	 * 3. IPv6 routing header is not present in the packet
5863 	 * 4. If there are no IP_OPTIONS in the IP HEADER
5864 	 */
5865 
5866 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
5867 	    (wc->wc_cksum == 0xFFFF) &&
5868 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
5869 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
5870 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
5871 	}
5872 
5873 	/*
5874 	 * Add this mp to the list of processed mp's to send to
5875 	 * the nw layer
5876 	 */
5877 	mutex_enter(&state->id_rx_lock);
5878 	if (state->id_rx_mp) {
5879 		ASSERT(state->id_rx_mp_tail != NULL);
5880 		state->id_rx_mp_tail->b_next = mp;
5881 	} else {
5882 		ASSERT(state->id_rx_mp_tail == NULL);
5883 		state->id_rx_mp = mp;
5884 	}
5885 
5886 	state->id_rx_mp_tail = mp;
5887 	state->id_rx_mp_len++;
5888 
5889 	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
5890 		mpc = state->id_rx_mp;
5891 
5892 		state->id_rx_mp = NULL;
5893 		state->id_rx_mp_tail = NULL;
5894 		state->id_rx_mp_len = 0;
5895 	}
5896 
5897 	mutex_exit(&state->id_rx_lock);
5898 
5899 	if (mpc) {
5900 		ibd_flush_rx(state, mpc);
5901 	}
5902 }
5903 
5904 /*
5905  * Callback code invoked from STREAMs when the receive data buffer is
5906  * free for recycling.
5907  */
5908 static void
5909 ibd_freemsg_cb(char *arg)
5910 {
5911 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
5912 	ibd_state_t *state = rwqe->w_state;
5913 
5914 	/*
5915 	 * If the wqe is being destructed, do not attempt recycling.
5916 	 */
5917 	if (rwqe->w_freeing_wqe == B_TRUE) {
5918 		DPRINT(6, "ibd_freemsg: wqe being freed");
5919 		return;
5920 	} else {
5921 		/*
5922 		 * Upper layer has released held mblk, so we have
5923 		 * no more use for keeping the old pointer in
5924 		 * our rwqe.
5925 		 */
5926 		rwqe->rwqe_im_mblk = NULL;
5927 	}
5928 
5929 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
5930 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
5931 	if (rwqe->rwqe_im_mblk == NULL) {
5932 		ibd_delete_rwqe(state, rwqe);
5933 		ibd_free_rwqe(state, rwqe);
5934 		DPRINT(6, "ibd_freemsg: desballoc failed");
5935 		return;
5936 	}
5937 
5938 	if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
5939 		ibd_delete_rwqe(state, rwqe);
5940 		ibd_free_rwqe(state, rwqe);
5941 		return;
5942 	}
5943 
5944 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
5945 }
5946 
5947 static uint_t
5948 ibd_tx_recycle(char *arg)
5949 {
5950 	ibd_state_t *state = (ibd_state_t *)arg;
5951 
5952 	/*
5953 	 * Poll for completed entries
5954 	 */
5955 	ibd_poll_compq(state, state->id_scq_hdl);
5956 
5957 	/*
5958 	 * Resume any blocked transmissions if possible
5959 	 */
5960 	(void) ibd_resume_transmission(state);
5961 
5962 	return (DDI_INTR_CLAIMED);
5963 }
5964 
5965 #ifdef IBD_LOGGING
5966 static void
5967 ibd_log_init(void)
5968 {
5969 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
5970 	ibd_lbuf_ndx = 0;
5971 }
5972 
5973 static void
5974 ibd_log_fini(void)
5975 {
5976 	if (ibd_lbuf)
5977 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
5978 	ibd_lbuf_ndx = 0;
5979 	ibd_lbuf = NULL;
5980 }
5981 
5982 static void
5983 ibd_log(const char *fmt, ...)
5984 {
5985 	va_list	ap;
5986 	uint32_t off;
5987 	uint32_t msglen;
5988 	char tmpbuf[IBD_DMAX_LINE];
5989 
5990 	if (ibd_lbuf == NULL)
5991 		return;
5992 
5993 	va_start(ap, fmt);
5994 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
5995 	va_end(ap);
5996 
5997 	if (msglen >= IBD_DMAX_LINE)
5998 		msglen = IBD_DMAX_LINE - 1;
5999 
6000 	mutex_enter(&ibd_lbuf_lock);
6001 
6002 	off = ibd_lbuf_ndx;		/* current msg should go here */
6003 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6004 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6005 
6006 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6007 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6008 
6009 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6010 		ibd_lbuf_ndx = 0;
6011 
6012 	mutex_exit(&ibd_lbuf_lock);
6013 
6014 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6015 }
6016 #endif
6017