xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 657a8c206b913d1ee578fd725f0b25eca5b77253)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <inet/tcp.h>		/* for tcph_t */
56 #include <netinet/icmp6.h>	/* for icmp6_t */
57 #include <sys/callb.h>
58 #include <sys/modhash.h>
59 
60 #include <sys/ib/clients/ibd/ibd.h>
61 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
62 #include <sys/note.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
67 /*
68  * Per-interface tunables
69  *
70  * ibd_tx_copy_thresh
71  *     This sets the threshold at which ibd will attempt to do a bcopy of the
72  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
73  *     is restricted by various parameters, so this value should be changed
74  *     only after careful consideration.  For instance, IB HCAs currently
75  *     impose a relatively small limit (when compared to ethernet NICs) on the
76  *     length of the SGL for transmit. On the other hand, the ip stack could
77  *     send down mp chains that are quite long when LSO is enabled.
78  *
79  * ibd_num_swqe
80  *     Number of "send WQE" elements that will be allocated and used by ibd.
81  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
82  *     buffer in each of these send wqes must be taken into account. This
83  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
84  *     currently set to the same value as ibd_tx_copy_thresh, but may be
85  *     changed independently if needed).
86  *
87  * ibd_num_rwqe
88  *     Number of "receive WQE" elements that will be allocated and used by
89  *     ibd. This parameter is limited by the maximum channel size of the HCA.
90  *     Each buffer in the receive wqe will be of MTU size.
91  *
92  * ibd_num_lso_bufs
93  *     Number of "larger-than-MTU" copy buffers to use for cases when the
94  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
95  *     and too large to be used with regular MTU-sized copy buffers. It is
96  *     not recommended to tune this variable without understanding the
97  *     application environment and/or memory resources. The size of each of
98  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
99  *
100  * ibd_num_ah
101  *     Number of AH cache entries to allocate
102  *
103  * ibd_hash_size
104  *     Hash table size for the active AH list
105  *
106  * ibd_separate_cqs
107  * ibd_txcomp_poll
108  *     These boolean variables (1 or 0) may be used to tune the behavior of
109  *     ibd in managing the send and receive completion queues and in deciding
110  *     whether or not transmit completions should be polled or interrupt
111  *     driven (when the completion queues are separate). If both the completion
112  *     queues are interrupt driven, it may not be possible for the handlers to
113  *     be invoked concurrently, depending on how the interrupts are tied on
114  *     the PCI intr line.  Note that some combinations of these two parameters
115  *     may not be meaningful (and therefore not allowed).
116  *
117  * ibd_tx_softintr
118  * ibd_rx_softintr
119  *     The softintr mechanism allows ibd to avoid event queue overflows if
120  *     the receive/completion handlers are expensive. These are enabled
121  *     by default.
122  *
123  * ibd_log_sz
124  *     This specifies the size of the ibd log buffer in bytes. The buffer is
125  *     allocated and logging is enabled only when IBD_LOGGING is defined.
126  *
127  */
128 uint_t ibd_tx_copy_thresh = 0x1000;
129 uint_t ibd_num_swqe = 4000;
130 uint_t ibd_num_rwqe = 4000;
131 uint_t ibd_num_lso_bufs = 0x400;
132 uint_t ibd_num_ah = 64;
133 uint_t ibd_hash_size = 32;
134 uint_t ibd_separate_cqs = 1;
135 uint_t ibd_txcomp_poll = 0;
136 uint_t ibd_rx_softintr = 1;
137 uint_t ibd_tx_softintr = 1;
138 #ifdef IBD_LOGGING
139 uint_t ibd_log_sz = 0x20000;
140 #endif
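
/*
 * Illustrative note (added; not part of the original source): since the
 * tunables above are plain module globals, they can presumably be set at
 * boot time through /etc/system, for example (values are hypothetical):
 *
 *	set ibd:ibd_num_swqe = 8000
 *	set ibd:ibd_tx_copy_thresh = 0x2000
 *
 * Any change should respect the constraints described above (HCA SGL and
 * channel-size limits, memory consumed by the pre-mapped copy buffers).
 */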
141 
142 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
143 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
144 #define	IBD_NUM_SWQE			ibd_num_swqe
145 #define	IBD_NUM_RWQE			ibd_num_rwqe
146 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
147 #define	IBD_NUM_AH			ibd_num_ah
148 #define	IBD_HASH_SIZE			ibd_hash_size
149 #ifdef IBD_LOGGING
150 #define	IBD_LOG_SZ			ibd_log_sz
151 #endif
152 
153 /*
154  * Receive CQ moderation parameters: NOT tunables
155  */
156 static uint_t ibd_rxcomp_count = 4;
157 static uint_t ibd_rxcomp_usec = 10;
158 
159 /*
160  * Send CQ moderation parameters: NOT tunables
161  */
162 #define	IBD_TXCOMP_COUNT		10
163 #define	IBD_TXCOMP_USEC			300
164 
165 /*
166  * Thresholds
167  *
168  * When waiting for resources (swqes or lso buffers) to become available,
169  * the first two thresholds below determine how many must become free before
170  * telling the network layer to resume sending packets. IBD_TX_POLL_THRESH
171  * determines how low the available swqes should go before we start polling
172  * the completion queue.
173  */
174 #define	IBD_FREE_LSOS_THRESH		8
175 #define	IBD_FREE_SWQES_THRESH		20
176 #define	IBD_TX_POLL_THRESH		80
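
/*
 * Minimal sketch (added for illustration; not part of the original code) of
 * how a resume check tied to IBD_FREE_SWQES_THRESH might look, assuming
 * id_tx_list.dl_cnt counts the free swqes and "mh" is the registered mac
 * handle; the actual logic lives in ibd_resume_transmission():
 *
 *	if (state->id_tx_list.dl_cnt > IBD_FREE_SWQES_THRESH)
 *		mac_tx_update(mh);
 */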
177 
178 /*
179  * When doing multiple-send-wr or multiple-recv-wr posts, this value
180  * determines how many to do at a time (in a single ibt_post_send/recv).
181  */
182 #define	IBD_MAX_POST_MULTIPLE		4
183 
184 /*
185  * Maximum length for returning chained mps back to crossbow
186  */
187 #define	IBD_MAX_RX_MP_LEN		16
188 
189 /*
190  * LSO parameters
191  */
192 #define	IBD_LSO_MAXLEN			65536
193 #define	IBD_LSO_BUFSZ			8192
194 #define	IBD_PROP_LSO_POLICY		"lso-policy"
195 
196 /*
197  * Completion queue polling control
198  */
199 #define	IBD_RX_CQ_POLLING		0x1
200 #define	IBD_TX_CQ_POLLING		0x2
201 #define	IBD_REDO_RX_CQ_POLLING		0x4
202 #define	IBD_REDO_TX_CQ_POLLING		0x8
203 
204 /*
205  * Flag bits for resources to reap
206  */
207 #define	IBD_RSRC_SWQE			0x1
208 #define	IBD_RSRC_LSOBUF			0x2
209 
210 /*
211  * Async operation types
212  */
213 #define	IBD_ASYNC_GETAH			1
214 #define	IBD_ASYNC_JOIN			2
215 #define	IBD_ASYNC_LEAVE			3
216 #define	IBD_ASYNC_PROMON		4
217 #define	IBD_ASYNC_PROMOFF		5
218 #define	IBD_ASYNC_REAP			6
219 #define	IBD_ASYNC_TRAP			7
220 #define	IBD_ASYNC_SCHED			8
221 #define	IBD_ASYNC_LINK			9
222 #define	IBD_ASYNC_EXIT			10
223 
224 /*
225  * Async operation states
226  */
227 #define	IBD_OP_NOTSTARTED		0
228 #define	IBD_OP_ONGOING			1
229 #define	IBD_OP_COMPLETED		2
230 #define	IBD_OP_ERRORED			3
231 #define	IBD_OP_ROUTERED			4
232 
233 /*
234  * State of IBD driver initialization during attach/m_start
235  */
236 #define	IBD_DRV_STATE_INITIALIZED	0x00001
237 #define	IBD_DRV_RXINTR_ADDED		0x00002
238 #define	IBD_DRV_TXINTR_ADDED		0x00004
239 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
240 #define	IBD_DRV_HCA_OPENED		0x00010
241 #define	IBD_DRV_PD_ALLOCD		0x00020
242 #define	IBD_DRV_MAC_REGISTERED		0x00040
243 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
244 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
245 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
246 #define	IBD_DRV_CQS_ALLOCD		0x00400
247 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
248 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
249 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
250 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
251 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
252 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
253 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
254 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
255 #define	IBD_DRV_STARTED			0x80000
256 
257 /*
258  * Miscellaneous constants
259  */
260 #define	IBD_SEND			0
261 #define	IBD_RECV			1
262 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
263 #define	IBD_DEF_MAX_SDU			2044
264 #ifdef IBD_LOGGING
265 #define	IBD_DMAX_LINE			100
266 #endif
267 
268 /*
269  * Enumerations for link states
270  */
271 typedef enum {
272 	IBD_LINK_DOWN,
273 	IBD_LINK_UP,
274 	IBD_LINK_UP_ABSENT
275 } ibd_link_op_t;
276 
277 /*
278  * Driver State Pointer
279  */
280 void *ibd_list;
281 
282 /*
283  * Logging
284  */
285 #ifdef IBD_LOGGING
286 kmutex_t ibd_lbuf_lock;
287 uint8_t *ibd_lbuf;
288 uint32_t ibd_lbuf_ndx;
289 #endif
290 
291 /*
292  * Required system entry points
293  */
294 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
295 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
296 
297 /*
298  * Required driver entry points for GLDv3
299  */
300 static int ibd_m_stat(void *, uint_t, uint64_t *);
301 static int ibd_m_start(void *);
302 static void ibd_m_stop(void *);
303 static int ibd_m_promisc(void *, boolean_t);
304 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
305 static int ibd_m_unicst(void *, const uint8_t *);
306 static mblk_t *ibd_m_tx(void *, mblk_t *);
307 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
308 
309 /*
310  * Private driver entry points for GLDv3
311  */
312 
313 /*
314  * Initialization
315  */
316 static int ibd_state_init(ibd_state_t *, dev_info_t *);
317 static int ibd_init_txlist(ibd_state_t *);
318 static int ibd_init_rxlist(ibd_state_t *);
319 static int ibd_acache_init(ibd_state_t *);
320 #ifdef IBD_LOGGING
321 static void ibd_log_init(void);
322 #endif
323 
324 /*
325  * Termination/cleanup
326  */
327 static void ibd_state_fini(ibd_state_t *);
328 static void ibd_fini_txlist(ibd_state_t *);
329 static void ibd_fini_rxlist(ibd_state_t *);
330 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
331 static void ibd_acache_fini(ibd_state_t *);
332 #ifdef IBD_LOGGING
333 static void ibd_log_fini(void);
334 #endif
335 
336 /*
337  * Allocation/acquire/map routines
338  */
339 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
340 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
341 static int ibd_alloc_tx_copybufs(ibd_state_t *);
342 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
343 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
344 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
345     uint32_t *);
346 
347 /*
348  * Free/release/unmap routines
349  */
350 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
351 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
352 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
353 static void ibd_free_tx_copybufs(ibd_state_t *);
354 static void ibd_free_tx_lsobufs(ibd_state_t *);
355 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
356 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
357 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
358 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
359 
360 /*
361  * Handlers/callback routines
362  */
363 static uint_t ibd_intr(char *);
364 static uint_t ibd_tx_recycle(char *);
365 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
366 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
367 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
368 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
369 static void ibd_freemsg_cb(char *);
370 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
371     ibt_async_event_t *);
372 static void ibd_snet_notices_handler(void *, ib_gid_t,
373     ibt_subnet_event_code_t, ibt_subnet_event_t *);
374 
375 /*
376  * Send/receive routines
377  */
378 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
379 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
380 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
381 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
382 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
383 
384 /*
385  * Threads
386  */
387 static void ibd_async_work(ibd_state_t *);
388 
389 /*
390  * Async tasks
391  */
392 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
393 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
394 static void ibd_async_setprom(ibd_state_t *);
395 static void ibd_async_unsetprom(ibd_state_t *);
396 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
397 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
398 static void ibd_async_txsched(ibd_state_t *);
399 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
400 
401 /*
402  * Async task helpers
403  */
404 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
405 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
406 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
407 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
408     ipoib_mac_t *, ipoib_mac_t *);
409 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
410 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
411 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
412 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
413 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
414 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
415 static uint64_t ibd_get_portspeed(ibd_state_t *);
416 static boolean_t ibd_async_safe(ibd_state_t *);
417 static void ibd_async_done(ibd_state_t *);
418 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
419 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
420 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
421 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
422 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
423 
424 /*
425  * Helpers for attach/start routines
426  */
427 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
428 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
429 static int ibd_unattach(ibd_state_t *, dev_info_t *);
430 static int ibd_get_port_details(ibd_state_t *);
431 static int ibd_alloc_cqs(ibd_state_t *);
432 static int ibd_setup_ud_channel(ibd_state_t *);
433 static int ibd_undo_m_start(ibd_state_t *);
434 
435 
436 /*
437  * Miscellaneous helpers
438  */
439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
441 static int ibd_resume_transmission(ibd_state_t *);
442 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
443 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
444 static void *list_get_head(list_t *);
445 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
446 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
447 static void ibd_print_warn(ibd_state_t *, char *, ...);
448 #ifdef IBD_LOGGING
449 static void ibd_log(const char *, ...);
450 #endif
451 
452 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
453     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
454 
455 /* Module Driver Info */
456 static struct modldrv ibd_modldrv = {
457 	&mod_driverops,			/* This one is a driver */
458 	"InfiniBand GLDv3 Driver",	/* short description */
459 	&ibd_dev_ops			/* driver specific ops */
460 };
461 
462 /* Module Linkage */
463 static struct modlinkage ibd_modlinkage = {
464 	MODREV_1, (void *)&ibd_modldrv, NULL
465 };
466 
467 /*
468  * Module (static) info passed to IBTL during ibt_attach
469  */
470 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
471 	IBTI_V_CURR,
472 	IBT_NETWORK,
473 	ibd_async_handler,
474 	NULL,
475 	"IPIB"
476 };
477 
478 /*
479  * GLDv3 entry points
480  */
481 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
482 static mac_callbacks_t ibd_m_callbacks = {
483 	IBD_M_CALLBACK_FLAGS,
484 	ibd_m_stat,
485 	ibd_m_start,
486 	ibd_m_stop,
487 	ibd_m_promisc,
488 	ibd_m_multicst,
489 	ibd_m_unicst,
490 	ibd_m_tx,
491 	NULL,
492 	ibd_m_getcapab
493 };
494 
495 /*
496  * Fill/clear <scope> and <p_key> in multicast/broadcast address
497  */
498 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
499 {							\
500 	*(uint32_t *)((char *)(maddr) + 4) |=		\
501 	    htonl((uint32_t)(scope) << 16);		\
502 	*(uint32_t *)((char *)(maddr) + 8) |=		\
503 	    htonl((uint32_t)(pkey) << 16);		\
504 }
505 
506 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
507 {							\
508 	*(uint32_t *)((char *)(maddr) + 4) &=		\
509 	    htonl(~((uint32_t)0xF << 16));		\
510 	*(uint32_t *)((char *)(maddr) + 8) &=		\
511 	    htonl(~((uint32_t)0xFFFF << 16));		\
512 }
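
/*
 * Worked example (added for illustration): the scope nibble lands in byte 1
 * of the GID portion (offset 4 of the ipoib_mac_t, past the 4-byte QPN) and
 * the pkey in GID bytes 4-5, matching the IPoIB MGID layout
 * ff1S:401b:PPPP:... (IPv4) or ff1S:601b:PPPP:... (IPv6). With scope 0x2
 * (link-local) and pkey 0xFFFF, IBD_FILL_SCOPE_PKEY turns the IPv4
 * broadcast template into the well-known broadcast GID
 * ff12:401b:ffff:0000:0000:0000:ffff:ffff; IBD_CLEAR_SCOPE_PKEY masks the
 * same bits back to zero.
 */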
513 
514 /*
515  * Rudimentary debugging support
516  */
517 #ifdef DEBUG
518 int ibd_debuglevel = 100;
519 static void
520 debug_print(int l, char *fmt, ...)
521 {
522 	va_list ap;
523 
524 	if (l < ibd_debuglevel)
525 		return;
526 	va_start(ap, fmt);
527 	vcmn_err(CE_CONT, fmt, ap);
528 	va_end(ap);
529 }
530 #define	DPRINT		debug_print
531 #else
532 #define	DPRINT
533 #endif
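
/*
 * Illustrative note (added; not part of the original source): on a DEBUG
 * build, ibd_debuglevel can presumably be lowered at runtime to make more
 * DPRINT() messages visible, e.g. with mdb:
 *
 *	# echo 'ibd_debuglevel/W 0t4' | mdb -kw
 *
 * Messages whose level is at or above ibd_debuglevel are printed (see
 * debug_print() above).
 */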
534 
535 /*
536  * Common routine to print warning messages; adds in hca guid, port number
537  * and pkey to be able to identify the IBA interface.
538  */
539 static void
540 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
541 {
542 	ib_guid_t hca_guid;
543 	char ibd_print_buf[256];
544 	int len;
545 	va_list ap;
546 
547 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
548 	    0, "hca-guid", 0);
549 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
550 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
551 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
552 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
553 	va_start(ap, fmt);
554 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
555 	    fmt, ap);
556 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
557 	va_end(ap);
558 }
559 
560 /*
561  * Warlock directives
562  */
563 
564 /*
565  * id_lso_lock
566  *
567  * state->id_lso->bkt_nfree may be accessed without a lock to
568  * determine the threshold at which we have to ask the nw layer
569  * to resume transmission (see ibd_resume_transmission()).
570  */
571 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
572     ibd_state_t::id_lso))
573 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
574 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
575 
576 /*
577  * id_cq_poll_lock
578  */
579 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
580     ibd_state_t::id_cq_poll_busy))
581 
582 /*
583  * id_txpost_lock
584  */
585 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
586     ibd_state_t::id_tx_head))
587 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
588     ibd_state_t::id_tx_busy))
589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
590     ibd_state_t::id_tx_tailp))
591 
592 /*
593  * id_rxpost_lock
594  */
595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
596     ibd_state_t::id_rx_head))
597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
598     ibd_state_t::id_rx_busy))
599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
600     ibd_state_t::id_rx_tailp))
601 
602 /*
603  * id_acache_req_lock
604  */
605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
606     ibd_state_t::id_acache_req_cv))
607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
608     ibd_state_t::id_req_list))
609 
610 /*
611  * id_ac_mutex
612  *
613  * This mutex is actually supposed to protect id_ah_op as well,
614  * but this path of the code isn't clean (see update of id_ah_op
615  * in ibd_async_acache(), immediately after the call to
616  * ibd_async_mcache()). For now, we'll skip this check by
617  * declaring that id_ah_op is protected by some internal scheme
618  * that warlock isn't aware of.
619  */
620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
621     ibd_state_t::id_ah_active))
622 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
623     ibd_state_t::id_ah_free))
624 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
625     ibd_state_t::id_ah_addr))
626 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
627     ibd_state_t::id_ah_op))
628 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
629     ibd_state_t::id_ah_error))
630 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
631 
632 /*
633  * id_mc_mutex
634  */
635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
636     ibd_state_t::id_mc_full))
637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
638     ibd_state_t::id_mc_non))
639 
640 /*
641  * id_trap_lock
642  */
643 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
644     ibd_state_t::id_trap_cv))
645 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
646     ibd_state_t::id_trap_stop))
647 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
648     ibd_state_t::id_trap_inprog))
649 
650 /*
651  * id_prom_op
652  */
653 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
654     ibd_state_t::id_prom_op))
655 
656 /*
657  * id_sched_lock
658  */
659 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
660     ibd_state_t::id_sched_needed))
661 
662 /*
663  * id_link_mutex
664  */
665 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
666     ibd_state_t::id_link_state))
667 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
668 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
669     ibd_state_t::id_link_speed))
670 
671 /*
672  * id_tx_list.dl_mutex
673  */
674 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
675     ibd_state_t::id_tx_list.dl_head))
676 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
677     ibd_state_t::id_tx_list.dl_tail))
678 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
679     ibd_state_t::id_tx_list.dl_pending_sends))
680 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
681     ibd_state_t::id_tx_list.dl_cnt))
682 
683 /*
684  * id_rx_list.dl_mutex
685  */
686 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
687     ibd_state_t::id_rx_list.dl_head))
688 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
689     ibd_state_t::id_rx_list.dl_tail))
690 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
691     ibd_state_t::id_rx_list.dl_bufs_outstanding))
692 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
693     ibd_state_t::id_rx_list.dl_cnt))
694 
695 
696 /*
697  * Items protected by atomic updates
698  */
699 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
700     ibd_state_s::id_brd_rcv
701     ibd_state_s::id_brd_xmt
702     ibd_state_s::id_multi_rcv
703     ibd_state_s::id_multi_xmt
704     ibd_state_s::id_num_intrs
705     ibd_state_s::id_rcv_bytes
706     ibd_state_s::id_rcv_pkt
707     ibd_state_s::id_tx_short
708     ibd_state_s::id_xmt_bytes
709     ibd_state_s::id_xmt_pkt))
710 
711 /*
712  * Non-mutex protection schemes for data elements. Almost all of
713  * these are non-shared items.
714  */
715 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
716     callb_cpr
717     ib_gid_s
718     ib_header_info
719     ibd_acache_rq
720     ibd_acache_s::ac_mce
721     ibd_mcache::mc_fullreap
722     ibd_mcache::mc_jstate
723     ibd_mcache::mc_req
724     ibd_rwqe_s
725     ibd_swqe_s
726     ibd_wqe_s
727     ibt_wr_ds_s::ds_va
728     ibt_wr_lso_s
729     ipoib_mac::ipoib_qpn
730     mac_capab_lso_s
731     msgb::b_next
732     msgb::b_rptr
733     msgb::b_wptr))
734 
735 int
736 _init()
737 {
738 	int status;
739 
740 	/*
741 	 * Sanity check some parameter settings. Tx completion polling
742 	 * only makes sense with separate CQs for Tx and Rx.
743 	 */
744 	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
745 		cmn_err(CE_NOTE, "!ibd: %s",
746 		    "Setting ibd_txcomp_poll = 0 for combined CQ");
747 		ibd_txcomp_poll = 0;
748 	}
749 
750 	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
751 	if (status != 0) {
752 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
753 		return (status);
754 	}
755 
756 	mac_init_ops(&ibd_dev_ops, "ibd");
757 	status = mod_install(&ibd_modlinkage);
758 	if (status != 0) {
759 		DPRINT(10, "_init:failed in mod_install()");
760 		ddi_soft_state_fini(&ibd_list);
761 		mac_fini_ops(&ibd_dev_ops);
762 		return (status);
763 	}
764 
765 #ifdef IBD_LOGGING
766 	ibd_log_init();
767 #endif
768 	return (0);
769 }
770 
771 int
772 _info(struct modinfo *modinfop)
773 {
774 	return (mod_info(&ibd_modlinkage, modinfop));
775 }
776 
777 int
778 _fini()
779 {
780 	int status;
781 
782 	status = mod_remove(&ibd_modlinkage);
783 	if (status != 0)
784 		return (status);
785 
786 	mac_fini_ops(&ibd_dev_ops);
787 	ddi_soft_state_fini(&ibd_list);
788 #ifdef IBD_LOGGING
789 	ibd_log_fini();
790 #endif
791 	return (0);
792 }
793 
794 /*
795  * Convert the GID part of the mac address from network byte order
796  * to host order.
797  */
798 static void
799 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
800 {
801 	ib_sn_prefix_t nbopref;
802 	ib_guid_t nboguid;
803 
804 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
805 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
806 	dgid->gid_prefix = b2h64(nbopref);
807 	dgid->gid_guid = b2h64(nboguid);
808 }
809 
810 /*
811  * Create the IPoIB address in network byte order from host order inputs.
812  */
813 static void
814 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
815     ib_guid_t guid)
816 {
817 	ib_sn_prefix_t nbopref;
818 	ib_guid_t nboguid;
819 
820 	mac->ipoib_qpn = htonl(qpn);
821 	nbopref = h2b64(prefix);
822 	nboguid = h2b64(guid);
823 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
824 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
825 }
826 
827 /*
828  * Send to the appropriate all-routers group when the IBA multicast group
829  * does not exist, based on whether the target group is v4 or v6.
830  */
831 static boolean_t
832 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
833     ipoib_mac_t *rmac)
834 {
835 	boolean_t retval = B_TRUE;
836 	uint32_t adjscope = state->id_scope << 16;
837 	uint32_t topword;
838 
839 	/*
840 	 * Copy the first 4 bytes in without assuming any alignment of
841 	 * input mac address; this will have IPoIB signature, flags and
842 	 * scope bits.
843 	 */
844 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
845 	topword = ntohl(topword);
846 
847 	/*
848 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
849 	 * Generate the proper address for IPv4/v6, adding in the Pkey.
850 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
851 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
852 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
853 		    ((uint32_t)(state->id_pkey << 16))),
854 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
855 	else
856 		/*
857 		 * Does not have proper bits in the mgid address.
858 		 */
859 		retval = B_FALSE;
860 
861 	return (retval);
862 }
863 
864 /*
865  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at the
866  * front of the optional src/tgt link layer address. Right now Solaris inserts
867  * the padding by default at the end. The routine that does this is nce_xmit()
868  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when the
869  * packet comes down from the IP layer to the IBD driver, it is in the
870  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
871  * (2 bytes), followed by [22 bytes of ipoib_machdr]. As a result, the machdr
872  * is not 4 byte aligned and has 2 bytes of padding at the end.
873  *
874  * The send routine at IBD driver changes this packet as follows:
875  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
876  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
877  * aligned.
878  *
879  * At the receiving side, ibd_process_rx takes the above packet and
880  * removes the two bytes of front padding and inserts them at the end. This
881  * is because the IP layer does not understand padding at the front.
882  */
883 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
884 	uchar_t 	*nd_lla_ptr;					\
885 	icmp6_t 	*icmp6;						\
886 	nd_opt_hdr_t	*opt;						\
887 	int 		i;						\
888 									\
889 	icmp6 = (icmp6_t *)&ip6h[1];					\
890 	len -= sizeof (nd_neighbor_advert_t);				\
891 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
892 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
893 	    (len != 0)) {						\
894 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
895 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
896 		ASSERT(opt != NULL);					\
897 		nd_lla_ptr = (uchar_t *)&opt[1];			\
898 		if (type == IBD_SEND) {					\
899 			for (i = IPOIB_ADDRL; i > 0; i--)		\
900 				*(nd_lla_ptr + i + 1) =			\
901 				    *(nd_lla_ptr + i - 1);		\
902 		} else {						\
903 			for (i = 0; i < IPOIB_ADDRL; i++)		\
904 				*(nd_lla_ptr + i) =			\
905 				    *(nd_lla_ptr + i + 2);		\
906 		}							\
907 		*(nd_lla_ptr + i) = 0;					\
908 		*(nd_lla_ptr + i + 1) = 0;				\
909 	}								\
910 }
911 
912 /*
913  * Address handle entries maintained by the driver are kept in the
914  * free and active lists. Each entry starts out in the free list;
915  * it migrates to the active list when primed using ibt_get_paths()
916  * and ibt_modify_ud_dest() for transmission to a specific destination.
917  * In the active list, the entry has a reference count indicating the
918  * number of ongoing/uncompleted transmits that reference it. The
919  * entry is left in the active list even after the reference count
920  * goes to 0, since successive transmits can find it there and do
921  * not need to set up another entry (ie the path information is
922  * cached using the active list). Entries on the active list are
923  * also hashed using the destination link address as a key for faster
924  * lookups during transmits.
925  *
926  * For any destination address (unicast or multicast, whatever the
927  * join states), there will be at most one entry in the active list.
928  * Entries with a 0 reference count on the active list can be reused
929  * for a transmit to a new destination, if the free list is empty.
930  *
931  * The AH free list insertion/deletion is protected with the id_ac_mutex,
932  * since the async thread and Tx callback handlers insert/delete. The
933  * active list does not need a lock (all operations are done by the
934  * async thread) but updates to the reference count are atomically
935  * done (increments done by Tx path, decrements by the Tx callback handler).
936  */
937 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
938 	list_insert_head(&state->id_ah_free, ce)
939 #define	IBD_ACACHE_GET_FREE(state) \
940 	list_get_head(&state->id_ah_free)
941 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
942 	int _ret_;						\
943 	list_insert_head(&state->id_ah_active, ce);		\
944 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
945 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
946 	ASSERT(_ret_ == 0);					\
947 }
948 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
949 	list_remove(&state->id_ah_active, ce);			\
950 	(void) mod_hash_remove(state->id_ah_active_hash,	\
951 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
952 }
953 #define	IBD_ACACHE_GET_ACTIVE(state) \
954 	list_get_head(&state->id_ah_active)
955 
956 /*
957  * Membership states for different mcg's are tracked by two lists:
958  * the "non" list is used for promiscuous mode, when all mcg traffic
959  * needs to be inspected. This type of membership is never used for
960  * transmission, so there can not be an AH in the active list
961  * corresponding to a member in this list. This list does not need
962  * any protection, since all operations are performed by the async
963  * thread.
964  *
965  * "Full" and "SendOnly" membership is tracked using a single list,
966  * the "full" list. This is because this single list can then be
967  * searched during transmit to a multicast group (if an AH for the
968  * mcg is not found in the active list), since at least one type
969  * of membership must be present before initiating the transmit.
970  * This list is also emptied during driver detach, since sendonly
971  * membership acquired during transmit is dropped at detach time
972  * along with ipv4 broadcast full membership. Inserts/deletes to
973  * this list are done only by the async thread, but it is also
974  * searched in program context (see multicast disable case), thus
975  * the id_mc_mutex protects the list. The driver detach path also
976  * deconstructs the "full" list, but it ensures that the async
977  * thread will not be accessing the list (by blocking out mcg
978  * trap handling and making sure no more Tx reaping will happen).
979  *
980  * Currently, an IBA attach is done in the SendOnly case too,
981  * although this is not required.
982  */
983 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
984 	list_insert_head(&state->id_mc_full, mce)
985 #define	IBD_MCACHE_INSERT_NON(state, mce) \
986 	list_insert_head(&state->id_mc_non, mce)
987 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
988 	ibd_mcache_find(mgid, &state->id_mc_full)
989 #define	IBD_MCACHE_FIND_NON(state, mgid) \
990 	ibd_mcache_find(mgid, &state->id_mc_non)
991 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
992 	list_remove(&state->id_mc_full, mce)
993 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
994 	list_remove(&state->id_mc_non, mce)
995 
996 /*
997  * AH and MCE active list manipulation:
998  *
999  * Multicast disable requests and MCG delete traps are two cases
1000  * where the active AH entry for the mcg (if any unreferenced one exists)
1001  * will be moved to the free list (to force the next Tx to the mcg to
1002  * join the MCG in SendOnly mode). Port up handling will also move AHs
1003  * from active to free list.
1004  *
1005  * In the case when some transmits are still pending on an entry
1006  * for an mcg, but a multicast disable has already been issued on the
1007  * mcg, there are some options to consider to preserve the join state
1008  * to ensure the emitted packet is properly routed on the IBA fabric.
1009  * For the AH, we can
1010  * 1. take out of active list at multicast disable time.
1011  * 2. take out of active list only when last pending Tx completes.
1012  * For the MCE, we can
1013  * 3. take out of active list at multicast disable time.
1014  * 4. take out of active list only when last pending Tx completes.
1015  * 5. move from active list to stale list at multicast disable time.
1016  * We choose to use 2,4. We use option 4 so that if a multicast enable
1017  * is tried before the pending Tx completes, the enable code finds the
1018  * mce in the active list and just has to make sure it will not be reaped
1019  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1020  * a stale list (#5) that would be checked in the enable code would need
1021  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1022  * after the multicast disable would try to put an AH in the active list,
1023  * and associate the mce it finds in the active list to this new AH,
1024  * whereas the mce is already associated with the previous AH (taken off
1025  * the active list), and will be removed once the pending Tx's complete
1026  * (unless a reference count on mce's is implemented). One implication of
1027  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1028  * grab new references on the AH, further delaying the leave.
1029  *
1030  * In the case of mcg delete (or create) trap when the port is sendonly
1031  * joined, the AH and MCE handling is different: the AH and MCE has to be
1032  * immediately taken off the active lists (forcing a join and path lookup
1033  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1034  * to an mcg as it is repeatedly created and deleted and goes through
1035  * reincarnations).
1036  *
1037  * When a port is already sendonly joined, and a multicast enable is
1038  * attempted, the same mce structure is promoted; this ensures only a
1039  * single mce on the active list tracks the most powerful join state.
1040  *
1041  * In the case of port up event handling, the MCE for sendonly membership
1042  * is freed up, and the ACE is put into the free list as soon as possible
1043  * (depending on whether posted Tx's have completed). For fullmembership
1044  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1045  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1046  * done; else the mce is deconstructed (mc_fullreap case).
1047  *
1048  * MCG creation and deletion trap handling:
1049  *
1050  * These traps are unreliable (meaning sometimes the trap might never
1051  * be delivered to the subscribed nodes) and may arrive out-of-order
1052  * since they use UD transport. An alternative to relying on these
1053  * unreliable traps is to poll for mcg presence every so often, but
1054  * instead of doing that, we try to be as conservative as possible
1055  * while handling the traps, and hope that the traps do arrive at
1056  * the subscribed nodes soon. Note that if a node is fullmember
1057  * joined to an mcg, it cannot possibly receive an mcg create/delete
1058  * trap for that mcg (by fullmember definition); if it does, it is
1059  * an old trap from a previous incarnation of the mcg.
1060  *
1061  * Whenever a trap is received, the driver cleans up its sendonly
1062  * membership to the group; we choose to do a sendonly leave even
1063  * on a creation trap to handle the case of a prior deletion of the mcg
1064  * having gone unnoticed. Consider an example scenario:
1065  * T1: MCG M is deleted, and fires off deletion trap D1.
1066  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1067  * T3: Node N tries to transmit to M, joining in sendonly mode.
1068  * T4: MCG M is deleted, and fires off deletion trap D2.
1069  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1070  *     If the trap is D2, then a LEAVE is not required, since the mcg
1071  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1072  *     approach is to always LEAVE, but the SM may be confused if it
1073  *     receives a LEAVE without a prior JOIN.
1074  *
1075  * Management of the non-membership to an mcg is similar to the above,
1076  * except that if the interface is in promiscuous mode, it is required
1077  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1078  * if the re-join attempt fails (in which case a warning message needs
1079  * to be printed), it is not clear whether it failed due to the mcg not
1080  * existing, or some fabric/hca issues, due to the delayed nature of
1081  * trap delivery. Querying the SA to establish presence/absence of the
1082  * mcg is also racy at best. Thus, the driver just prints a warning
1083  * message when it cannot rejoin after receiving a create trap, although
1084  * this might be (on rare occasions) a mis-warning if the create trap is
1085  * received after the mcg was deleted.
1086  */
1087 
1088 /*
1089  * Implementation of atomic "recycle" bits and reference count
1090  * on address handles. This utilizes the fact that max reference
1091  * count on any handle is limited by number of send wqes, thus
1092  * high bits in the ac_ref field can be used as the recycle bits,
1093  * and only the low bits hold the number of pending Tx requests.
1094  * This atomic AH reference counting allows the Tx completion
1095  * handler not to acquire the id_ac_mutex to process every completion,
1096  * thus reducing lock contention problems between completion and
1097  * the Tx path.
1098  */
1099 #define	CYCLEVAL		0x80000
1100 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
1101 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
1102 #define	GET_REF(ace)		((ace)->ac_ref)
1103 #define	GET_REF_CYCLE(ace) (				\
1104 	/*						\
1105 	 * Make sure "cycle" bit is set.		\
1106 	 */						\
1107 	ASSERT(CYCLE_SET(ace)),				\
1108 	((ace)->ac_ref & ~(CYCLEVAL))			\
1109 )
1110 #define	INC_REF(ace, num) {				\
1111 	atomic_add_32(&(ace)->ac_ref, num);		\
1112 }
1113 #define	SET_CYCLE_IF_REF(ace) (				\
1114 	CYCLE_SET(ace) ? B_TRUE :			\
1115 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
1116 		CYCLEVAL ?				\
1117 		/*					\
1118 		 * Clear the "cycle" bit we just set;	\
1119 		 * ref count known to be 0 from above.	\
1120 		 */					\
1121 		CLEAR_REFCYCLE(ace), B_FALSE :		\
1122 		/*					\
1123 		 * We set "cycle" bit; let caller know.	\
1124 		 */					\
1125 		B_TRUE					\
1126 )
1127 #define	DEC_REF_DO_CYCLE(ace) (				\
1128 	atomic_add_32_nv(&ace->ac_ref, -1) ==		\
1129 	    CYCLEVAL ?					\
1130 		/*					\
1131 		 * Ref count known to be 0 from above.	\
1132 		 */					\
1133 		B_TRUE :				\
1134 		B_FALSE					\
1135 )
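
/*
 * Illustrative sketch (added; not part of the original code) of how these
 * macros combine; the real Tx and completion paths appear later in this
 * file and may differ in detail:
 *
 *	Tx path:	ibd_acache_lookup() grabs references via
 *			INC_REF(ace, numwqe) before the sends are posted.
 *	Mcast disable:	if (SET_CYCLE_IF_REF(ace)) there are still pending
 *			Tx's; the entry is reaped once they complete.
 *	Tx completion:	if (DEC_REF_DO_CYCLE(ace)) the last pending Tx on a
 *			marked entry just finished, so the handler takes
 *			id_ac_mutex and moves the ace to the free list.
 */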
1136 
1137 static void *
1138 list_get_head(list_t *list)
1139 {
1140 	list_node_t *lhead = list_head(list);
1141 
1142 	if (lhead != NULL)
1143 		list_remove(list, lhead);
1144 	return (lhead);
1145 }
1146 
1147 /*
1148  * This is always guaranteed to be able to queue the work.
1149  */
1150 static void
1151 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1152 {
1153 	/* Initialize request */
1154 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1155 	ptr->rq_op = op;
1156 
1157 	/*
1158 	 * Queue the provided slot onto the request list.
1159 	 */
1160 	mutex_enter(&state->id_acache_req_lock);
1161 	list_insert_tail(&state->id_req_list, ptr);
1162 
1163 	/* Go, fetch, async thread */
1164 	cv_signal(&state->id_acache_req_cv);
1165 	mutex_exit(&state->id_acache_req_lock);
1166 }
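
/*
 * Hypothetical usage sketch (added for illustration, mirroring how the
 * driver queues async work elsewhere in this file):
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		req->rq_gid = mgid;
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
 *	}
 *
 * The async thread (ibd_async_work() below) dequeues the request,
 * dispatches on rq_op, and for most ops frees the request back to
 * id_req_kmc when done.
 */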
1167 
1168 /*
1169  * Main body of the per interface async thread.
1170  */
1171 static void
1172 ibd_async_work(ibd_state_t *state)
1173 {
1174 	ibd_req_t *ptr;
1175 	callb_cpr_t cprinfo;
1176 
1177 	mutex_enter(&state->id_acache_req_lock);
1178 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1179 	    callb_generic_cpr, "ibd_async_work");
1180 
1181 	for (;;) {
1182 		ptr = list_get_head(&state->id_req_list);
1183 		if (ptr != NULL) {
1184 			mutex_exit(&state->id_acache_req_lock);
1185 
1186 			/*
1187 			 * Once we have done the operation, there is no
1188 			 * guarantee the request slot is going to be valid,
1189 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1190 			 * TRAP).
1191 			 *
1192 			 * Perform the request.
1193 			 */
1194 			switch (ptr->rq_op) {
1195 				case IBD_ASYNC_GETAH:
1196 					ibd_async_acache(state, &ptr->rq_mac);
1197 					break;
1198 				case IBD_ASYNC_JOIN:
1199 				case IBD_ASYNC_LEAVE:
1200 					ibd_async_multicast(state,
1201 					    ptr->rq_gid, ptr->rq_op);
1202 					break;
1203 				case IBD_ASYNC_PROMON:
1204 					ibd_async_setprom(state);
1205 					break;
1206 				case IBD_ASYNC_PROMOFF:
1207 					ibd_async_unsetprom(state);
1208 					break;
1209 				case IBD_ASYNC_REAP:
1210 					ibd_async_reap_group(state,
1211 					    ptr->rq_ptr, ptr->rq_gid,
1212 					    IB_MC_JSTATE_FULL);
1213 					/*
1214 					 * the req buf is contained in the mce
1215 					 * structure, so we do not need
1216 					 * to free it here.
1217 					 */
1218 					ptr = NULL;
1219 					break;
1220 				case IBD_ASYNC_TRAP:
1221 					ibd_async_trap(state, ptr);
1222 					break;
1223 				case IBD_ASYNC_SCHED:
1224 					ibd_async_txsched(state);
1225 					break;
1226 				case IBD_ASYNC_LINK:
1227 					ibd_async_link(state, ptr);
1228 					break;
1229 				case IBD_ASYNC_EXIT:
1230 					mutex_enter(&state->id_acache_req_lock);
1231 #ifndef __lock_lint
1232 					CALLB_CPR_EXIT(&cprinfo);
1233 #else
1234 					mutex_exit(&state->id_acache_req_lock);
1235 #endif
1236 					return;
1237 			}
1238 			if (ptr != NULL)
1239 				kmem_cache_free(state->id_req_kmc, ptr);
1240 
1241 			mutex_enter(&state->id_acache_req_lock);
1242 		} else {
1243 #ifndef __lock_lint
1244 			/*
1245 			 * Nothing to do: wait till new request arrives.
1246 			 */
1247 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1248 			cv_wait(&state->id_acache_req_cv,
1249 			    &state->id_acache_req_lock);
1250 			CALLB_CPR_SAFE_END(&cprinfo,
1251 			    &state->id_acache_req_lock);
1252 #endif
1253 		}
1254 	}
1255 
1256 	/*NOTREACHED*/
1257 	_NOTE(NOT_REACHED)
1258 }
1259 
1260 /*
1261  * Return whether it is safe to queue requests to the async daemon; primarily
1262  * for subnet trap and async event handling. Disallow requests before the
1263  * daemon is created, and when interface deinitialization starts.
1264  */
1265 static boolean_t
1266 ibd_async_safe(ibd_state_t *state)
1267 {
1268 	mutex_enter(&state->id_trap_lock);
1269 	if (state->id_trap_stop) {
1270 		mutex_exit(&state->id_trap_lock);
1271 		return (B_FALSE);
1272 	}
1273 	state->id_trap_inprog++;
1274 	mutex_exit(&state->id_trap_lock);
1275 	return (B_TRUE);
1276 }
1277 
1278 /*
1279  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1280  * trap or event handling to complete to kill the async thread and deconstruct
1281  * the mcg/ace list.
1282  */
1283 static void
1284 ibd_async_done(ibd_state_t *state)
1285 {
1286 	mutex_enter(&state->id_trap_lock);
1287 	if (--state->id_trap_inprog == 0)
1288 		cv_signal(&state->id_trap_cv);
1289 	mutex_exit(&state->id_trap_lock);
1290 }
1291 
1292 /*
1293  * Hash functions:
1294  * ibd_hash_by_id: Returns the qpn as the hash value used to pick a bucket.
1295  * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
1296  * These operate on mac addresses input into ibd_send, but there is no
1297  * guarantee on the alignment of the ipoib_mac_t structure.
1298  */
1299 /*ARGSUSED*/
1300 static uint_t
1301 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1302 {
1303 	ulong_t ptraddr = (ulong_t)key;
1304 	uint_t hval;
1305 
1306 	/*
1307 	 * If the input address is 4 byte aligned, we can just dereference
1308 	 * it. This is most common, since IP will send in a 4 byte aligned
1309  * IP header, which implies the 24 byte IPoIB pseudo header will be
1310 	 * 4 byte aligned too.
1311 	 */
1312 	if ((ptraddr & 3) == 0)
1313 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1314 
1315 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1316 	return (hval);
1317 }
1318 
1319 static int
1320 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1321 {
1322 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1323 		return (0);
1324 	else
1325 		return (1);
1326 }
1327 
1328 /*
1329  * Initialize all the per interface caches and lists; AH cache,
1330  * MCG list etc.
1331  */
1332 static int
1333 ibd_acache_init(ibd_state_t *state)
1334 {
1335 	ibd_ace_t *ce;
1336 	int i;
1337 
1338 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1339 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1340 
1341 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1342 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1343 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1344 	    offsetof(ibd_ace_t, ac_list));
1345 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1346 	    offsetof(ibd_ace_t, ac_list));
1347 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1348 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1349 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1350 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1351 	    offsetof(ibd_mce_t, mc_list));
1352 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1353 	    offsetof(ibd_mce_t, mc_list));
1354 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1355 	    offsetof(ibd_req_t, rq_list));
1356 
1357 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1358 	    IBD_NUM_AH, KM_SLEEP);
1359 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1360 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1361 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1362 			ibd_acache_fini(state);
1363 			return (DDI_FAILURE);
1364 		} else {
1365 			CLEAR_REFCYCLE(ce);
1366 			ce->ac_mce = NULL;
1367 			IBD_ACACHE_INSERT_FREE(state, ce);
1368 		}
1369 	}
1370 	return (DDI_SUCCESS);
1371 }
1372 
1373 static void
1374 ibd_acache_fini(ibd_state_t *state)
1375 {
1376 	ibd_ace_t *ptr;
1377 
1378 	mutex_enter(&state->id_ac_mutex);
1379 
1380 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1381 		ASSERT(GET_REF(ptr) == 0);
1382 		(void) ibt_free_ud_dest(ptr->ac_dest);
1383 	}
1384 
1385 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1386 		ASSERT(GET_REF(ptr) == 0);
1387 		(void) ibt_free_ud_dest(ptr->ac_dest);
1388 	}
1389 
1390 	list_destroy(&state->id_ah_free);
1391 	list_destroy(&state->id_ah_active);
1392 	list_destroy(&state->id_mc_full);
1393 	list_destroy(&state->id_mc_non);
1394 	list_destroy(&state->id_req_list);
1395 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1396 	mutex_exit(&state->id_ac_mutex);
1397 	mutex_destroy(&state->id_ac_mutex);
1398 	mutex_destroy(&state->id_mc_mutex);
1399 	mutex_destroy(&state->id_acache_req_lock);
1400 	cv_destroy(&state->id_acache_req_cv);
1401 }
1402 
1403 /*
1404  * Search AH active hash list for a cached path to input destination.
1405  * If we are "just looking", hold == F. When we are in the Tx path,
1406  * we set hold == T to grab a reference on the AH so that it can not
1407  * be recycled to a new destination while the Tx request is posted.
1408  */
1409 static ibd_ace_t *
1410 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1411 {
1412 	ibd_ace_t *ptr;
1413 
1414 	ASSERT(mutex_owned(&state->id_ac_mutex));
1415 
1416 	/*
1417 	 * Do hash search.
1418 	 */
1419 	if (mod_hash_find(state->id_ah_active_hash,
1420 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1421 		if (hold)
1422 			INC_REF(ptr, num);
1423 		return (ptr);
1424 	}
1425 	return (NULL);
1426 }
1427 
1428 /*
1429  * This is called by the tx side; if an initialized AH is found in
1430  * the active list, it is locked down and can be used; if no entry
1431  * is found, an async request is queued to do path resolution.
1432  */
1433 static ibd_ace_t *
1434 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1435 {
1436 	ibd_ace_t *ptr;
1437 	ibd_req_t *req;
1438 
1439 	/*
1440 	 * Only attempt to print when we can; in the mdt pattr case, the
1441 	 * address is not aligned properly.
1442 	 */
1443 	if (((ulong_t)mac & 3) == 0) {
1444 		DPRINT(4,
1445 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1446 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1447 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1448 		    htonl(mac->ipoib_gidsuff[1]));
1449 	}
1450 
1451 	mutex_enter(&state->id_ac_mutex);
1452 
1453 	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
1454 		mutex_exit(&state->id_ac_mutex);
1455 		return (ptr);
1456 	}
1457 
1458 	/*
1459 	 * Implementation of a single outstanding async request; if
1460 	 * the operation is not started yet, queue a request and move
1461 	 * to ongoing state. Remember in id_ah_addr for which address
1462 	 * we are queueing the request, in case we need to flag an error;
1463 	 * any further requests, for the same or a different address, until
1464 	 * the operation completes, are sent back to GLDv3 to be retried.
1465 	 * The async thread will update id_ah_op with an error indication
1466 	 * or will set it to indicate the next look up can start; either
1467 	 * way, it will mac_tx_update() so that all blocked requests come
1468 	 * back here.
1469 	 */
1470 	*err = EAGAIN;
1471 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1472 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1473 		if (req != NULL) {
1474 			/*
1475 			 * We did not even find the entry; queue a request
1476 			 * for it.
1477 			 */
1478 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1479 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1480 			state->id_ah_op = IBD_OP_ONGOING;
1481 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1482 		}
1483 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1484 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1485 		/*
1486 		 * Check the status of the pathrecord lookup request
1487 		 * we had queued before.
1488 		 */
1489 		if (state->id_ah_op == IBD_OP_ERRORED) {
1490 			*err = EFAULT;
1491 			state->id_ah_error++;
1492 		} else {
1493 			/*
1494 			 * IBD_OP_ROUTERED case: We need to send to the
1495 			 * all-router MCG. If we can find the AH for
1496 			 * the mcg, the Tx will be attempted. If we
1497 			 * do not find the AH, we return NORESOURCES
1498 			 * to retry.
1499 			 */
1500 			ipoib_mac_t routermac;
1501 
1502 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1503 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1504 			    numwqe);
1505 		}
1506 		state->id_ah_op = IBD_OP_NOTSTARTED;
1507 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1508 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1509 		/*
1510 		 * This case can happen when we get a higher priority (band)
1511 		 * packet. The easiest approach is to reset the state machine
1512 		 * to accommodate the higher priority packet.
1513 		 */
1514 		state->id_ah_op = IBD_OP_NOTSTARTED;
1515 	}
1516 	mutex_exit(&state->id_ac_mutex);
1517 
1518 	return (ptr);
1519 }
1520 
1521 /*
1522  * Grab a not-currently-in-use AH/PathRecord from the active
1523  * list to recycle to a new destination. Only the async thread
1524  * executes this code.
1525  */
1526 static ibd_ace_t *
1527 ibd_acache_get_unref(ibd_state_t *state)
1528 {
1529 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1530 
1531 	ASSERT(mutex_owned(&state->id_ac_mutex));
1532 
1533 	/*
1534 	 * Do plain linear search.
1535 	 */
1536 	while (ptr != NULL) {
1537 		/*
1538 		 * Note that it is possible that the "cycle" bit
1539 		 * is set on the AH w/o any reference count. The
1540 		 * mcg must have been deleted, and the tx cleanup
1541 		 * just decremented the reference count to 0, but
1542 		 * hasn't gotten around to grabbing the id_ac_mutex
1543 		 * to move the AH into the free list.
1544 		 */
1545 		if (GET_REF(ptr) == 0) {
1546 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1547 			break;
1548 		}
1549 		ptr = list_next(&state->id_ah_active, ptr);
1550 	}
1551 	return (ptr);
1552 }
1553 
1554 /*
1555  * Invoked to clean up AH from active list in case of multicast
1556  * disable and to handle sendonly memberships during mcg traps.
1557  * disable, to handle sendonly memberships during mcg traps, and
1558  * for port up processing of multicast and unicast AHs.
1559  * the free list to be recycled for a new destination. In case
1560  * Tx requests on the AH have not completed yet, the AH is marked
1561  * for reaping (which will put the AH on the free list) once the Tx's
1562  * complete; in this case, depending on the "force" input, we take
1563  * out the AH from the active list right now, or leave it also for
1564  * the reap operation. Returns TRUE if the AH is taken off the active
1565  * list (and either put into the free list right now, or arranged for
1566  * later), FALSE otherwise.
1567  */
1568 static boolean_t
1569 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1570 {
1571 	ibd_ace_t *acactive;
1572 	boolean_t ret = B_TRUE;
1573 
1574 	ASSERT(mutex_owned(&state->id_ac_mutex));
1575 
1576 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1577 
1578 		/*
1579 		 * Note that the AH might already have the cycle bit set
1580 		 * on it; this might happen if sequences of multicast
1581 		 * enables and disables are coming so fast, that posted
1582 		 * Tx's to the mcg have not completed yet, and the cycle
1583 		 * bit is set successively by each multicast disable.
1584 		 */
1585 		if (SET_CYCLE_IF_REF(acactive)) {
1586 			if (!force) {
1587 				/*
1588 				 * The ace is kept on the active list, further
1589 				 * Tx's can still grab a reference on it; the
1590 				 * ace is reaped when all pending Tx's
1591 				 * referencing the AH complete.
1592 				 */
1593 				ret = B_FALSE;
1594 			} else {
1595 				/*
1596 				 * In the mcg trap case, we always pull the
1597 				 * AH from the active list. And also the port
1598 				 * up multi/unicast case.
1599 				 */
1600 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1601 				acactive->ac_mce = NULL;
1602 			}
1603 		} else {
1604 			/*
1605 			 * We determined the ref count is 0, so reclaim
1606 			 * immediately after pulling the ace out of
1607 			 * the active list.
1608 			 */
1609 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1610 			acactive->ac_mce = NULL;
1611 			IBD_ACACHE_INSERT_FREE(state, acactive);
1612 		}
1613 
1614 	}
1615 	return (ret);
1616 }
1617 
1618 /*
1619  * Helper function for async path record lookup. If we are trying to
1620  * Tx to a MCG, check our membership, possibly trying to join the
1621  * group if required. If that fails, try to send the packet to the
1622  * all router group (indicated by the redirect output), pointing
1623  * the input mac address to the router mcg address.
1624  */
1625 static ibd_mce_t *
1626 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1627 {
1628 	ib_gid_t mgid;
1629 	ibd_mce_t *mce;
1630 	ipoib_mac_t routermac;
1631 
1632 	*redirect = B_FALSE;
1633 	ibd_n2h_gid(mac, &mgid);
1634 
1635 	/*
1636 	 * Check the FullMember+SendOnlyNonMember list.
1637 	 * Since we are the only one who manipulates the
1638 	 * id_mc_full list, no locks are needed.
1639 	 */
1640 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1641 	if (mce != NULL) {
1642 		DPRINT(4, "ibd_async_mcache : already joined to group");
1643 		return (mce);
1644 	}
1645 
1646 	/*
1647 	 * Not found; try to join(SendOnlyNonMember) and attach.
1648 	 */
1649 	DPRINT(4, "ibd_async_mcache : not joined to group");
1650 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1651 	    NULL) {
1652 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1653 		return (mce);
1654 	}
1655 
1656 	/*
1657 	 * MCGroup not present; try to join the all-router group. If
1658 	 * any of the following steps succeed, we will be redirecting
1659 	 * to the all router group.
1660 	 */
1661 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1662 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1663 		return (NULL);
1664 	*redirect = B_TRUE;
1665 	ibd_n2h_gid(&routermac, &mgid);
1666 	bcopy(&routermac, mac, IPOIB_ADDRL);
1667 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1668 	    mgid.gid_prefix, mgid.gid_guid);
1669 
1670 	/*
1671 	 * Are we already joined to the router group?
1672 	 */
1673 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1674 		DPRINT(4, "ibd_async_mcache : using already joined router "
1675 		    "group\n");
1676 		return (mce);
1677 	}
1678 
1679 	/*
1680 	 * Can we join(SendOnlyNonMember) the router group?
1681 	 */
1682 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1683 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1684 	    NULL) {
1685 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1686 		return (mce);
1687 	}
1688 
1689 	return (NULL);
1690 }
1691 
1692 /*
1693  * Async path record lookup code.
1694  */
1695 static void
1696 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1697 {
1698 	ibd_ace_t *ce;
1699 	ibd_mce_t *mce = NULL;
1700 	ibt_path_attr_t path_attr;
1701 	ibt_path_info_t path_info;
1702 	ib_gid_t destgid;
1703 	char ret = IBD_OP_NOTSTARTED;
1704 
1705 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1706 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1707 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1708 	    htonl(mac->ipoib_gidsuff[1]));
1709 
1710 	/*
1711 	 * Check whether we are trying to transmit to a MCG.
1712 	 * In that case, we need to make sure we are a member of
1713 	 * the MCG.
1714 	 */
1715 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1716 		boolean_t redirected;
1717 
1718 		/*
1719 		 * If we can not find or join the group or even
1720 		 * redirect, error out.
1721 		 */
1722 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1723 		    NULL) {
1724 			state->id_ah_op = IBD_OP_ERRORED;
1725 			return;
1726 		}
1727 
1728 		/*
1729 		 * If we got redirected, we need to determine whether
1730 		 * the AH for the new mcg is already in the cache, and
1731 		 * if so, avoid pulling it in again; otherwise proceed
1732 		 * to get the path for the new mcg. There is no guarantee
1733 		 * that if the AH is currently in the cache, it will still
1734 		 * be there when we look in ibd_acache_lookup(), but that's
1735 		 * okay; we will come back here.
1736 		 */
1737 		if (redirected) {
1738 			ret = IBD_OP_ROUTERED;
1739 			DPRINT(4, "ibd_async_acache :  redirected to "
1740 			    "%08X:%08X:%08X:%08X:%08X",
1741 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1742 			    htonl(mac->ipoib_gidpref[1]),
1743 			    htonl(mac->ipoib_gidsuff[0]),
1744 			    htonl(mac->ipoib_gidsuff[1]));
1745 
1746 			mutex_enter(&state->id_ac_mutex);
1747 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1748 				state->id_ah_op = IBD_OP_ROUTERED;
1749 				mutex_exit(&state->id_ac_mutex);
1750 				DPRINT(4, "ibd_async_acache : router AH found");
1751 				return;
1752 			}
1753 			mutex_exit(&state->id_ac_mutex);
1754 		}
1755 	}
1756 
1757 	/*
1758 	 * Get an AH from the free list.
1759 	 */
1760 	mutex_enter(&state->id_ac_mutex);
1761 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1762 		/*
1763 		 * No free ones; try to grab an unreferenced active
1764 		 * one. Maybe we need to make the active list LRU,
1765 		 * but that will create more work for Tx callbacks.
1766 		 * Is there a way of not having to pull out the
1767 		 * entry from the active list, but just indicate it
1768 		 * is being recycled? Yes, but that creates one more
1769 		 * check in the fast lookup path.
1770 		 */
1771 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1772 			/*
1773 			 * Pretty serious shortage now.
1774 			 */
1775 			state->id_ah_op = IBD_OP_NOTSTARTED;
1776 			mutex_exit(&state->id_ac_mutex);
1777 			DPRINT(10, "ibd_async_acache : failed to find AH "
1778 			    "slot\n");
1779 			return;
1780 		}
1781 		/*
1782 		 * We could check whether ac_mce points to a SendOnly
1783 		 * member and drop that membership now. Or do it lazily
1784 		 * at detach time.
1785 		 */
1786 		ce->ac_mce = NULL;
1787 	}
1788 	mutex_exit(&state->id_ac_mutex);
1789 	ASSERT(ce->ac_mce == NULL);
1790 
1791 	/*
1792 	 * Update the entry.
1793 	 */
1794 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1795 
1796 	bzero(&path_info, sizeof (path_info));
1797 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1798 	path_attr.pa_sgid = state->id_sgid;
1799 	path_attr.pa_num_dgids = 1;
1800 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1801 	path_attr.pa_dgids = &destgid;
1802 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1803 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1804 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1805 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1806 		goto error;
1807 	}
1808 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1809 	    ntohl(ce->ac_mac.ipoib_qpn),
1810 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1811 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1812 		goto error;
1813 	}
1814 
1815 	/*
1816 	 * mce is set whenever an AH is being associated with a
1817 	 * MCG; this will come in handy when we leave the MCG. The
1818 	 * lock protects Tx fastpath from scanning the active list.
1819 	 */
1820 	if (mce != NULL)
1821 		ce->ac_mce = mce;
1822 	mutex_enter(&state->id_ac_mutex);
1823 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1824 	state->id_ah_op = ret;
1825 	mutex_exit(&state->id_ac_mutex);
1826 	return;
1827 error:
1828 	/*
1829 	 * We might want to drop SendOnly membership here if we
1830 	 * joined above. The lock protects Tx callbacks inserting
1831 	 * into the free list.
1832 	 */
1833 	mutex_enter(&state->id_ac_mutex);
1834 	state->id_ah_op = IBD_OP_ERRORED;
1835 	IBD_ACACHE_INSERT_FREE(state, ce);
1836 	mutex_exit(&state->id_ac_mutex);
1837 }
1838 
1839 /*
1840  * While restoring the port's presence on the subnet on a port up event, it
1841  * is possible that the port goes down again.
1842  */
1843 static void
1844 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1845 {
1846 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1847 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1848 	    LINK_STATE_UP;
1849 	ibd_mce_t *mce, *pmce;
1850 	ibd_ace_t *ace, *pace;
1851 
1852 	DPRINT(10, "ibd_async_link(): %d", opcode);
1853 
1854 	/*
1855 	 * On a link up, revalidate the link speed/width. No point doing
1856 	 * this on a link down, since we will be unable to do SA operations,
1857 	 * defaulting to the lowest speed. Also notice that we update our
1858 	 * notion of speed before calling mac_link_update(), which will do
1859 	 * the necessary higher level notifications for speed changes.
1860 	 */
1861 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1862 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1863 		state->id_link_speed = ibd_get_portspeed(state);
1864 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1865 	}
1866 
1867 	/*
1868 	 * Do all the work required to establish our presence on
1869 	 * the subnet.
1870 	 */
1871 	if (opcode == IBD_LINK_UP_ABSENT) {
1872 		/*
1873 		 * If in promiscuous mode ...
1874 		 */
1875 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1876 			/*
1877 			 * Drop all nonmembership.
1878 			 */
1879 			ibd_async_unsetprom(state);
1880 
1881 			/*
1882 			 * Then, try to regain nonmembership to all mcg's.
1883 			 */
1884 			ibd_async_setprom(state);
1885 
1886 		}
1887 
1888 		/*
1889 		 * Drop all sendonly membership (which also gets rid of the
1890 		 * AHs); try to reacquire all full membership.
1891 		 */
1892 		mce = list_head(&state->id_mc_full);
1893 		while ((pmce = mce) != NULL) {
1894 			mce = list_next(&state->id_mc_full, mce);
1895 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1896 				ibd_leave_group(state,
1897 				    pmce->mc_info.mc_adds_vect.av_dgid,
1898 				    IB_MC_JSTATE_SEND_ONLY_NON);
1899 			else
1900 				ibd_reacquire_group(state, pmce);
1901 		}
1902 
1903 		/*
1904 		 * Recycle all active AHs to free list (and if there are
1905 		 * pending posts, make sure they will go into the free list
1906 		 * once the Tx's complete). Grab the lock to prevent
1907 		 * concurrent Tx's as well as Tx cleanups.
1908 		 */
1909 		mutex_enter(&state->id_ac_mutex);
1910 		ace = list_head(&state->id_ah_active);
1911 		while ((pace = ace) != NULL) {
1912 			boolean_t cycled;
1913 
1914 			ace = list_next(&state->id_ah_active, ace);
1915 			mce = pace->ac_mce;
1916 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1917 			    B_TRUE);
1918 			/*
1919 			 * If this is for an mcg, it must be for a fullmember,
1920 			 * since we got rid of send-only members above when
1921 			 * processing the mce list.
1922 			 */
1923 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1924 			    IB_MC_JSTATE_FULL)));
1925 
1926 			/*
1927 			 * Check if the fullmember mce needs to be torn down,
1928 			 * ie whether the DLPI disable has already been done.
1929 			 * If so, do some of the work of tx_cleanup, namely
1930 			 * causing leave (which will fail), detach and
1931 			 * mce-freeing. tx_cleanup will put the AH into free
1932 			 * list. The reason to duplicate some of this
1933 			 * tx_cleanup work is because we want to delete the
1934 			 * AH right now instead of waiting for tx_cleanup, to
1935 			 * force subsequent Tx's to reacquire an AH.
1936 			 */
1937 			if ((mce != NULL) && (mce->mc_fullreap))
1938 				ibd_async_reap_group(state, mce,
1939 				    mce->mc_info.mc_adds_vect.av_dgid,
1940 				    mce->mc_jstate);
1941 		}
1942 		mutex_exit(&state->id_ac_mutex);
1943 	}
1944 
1945 	/*
1946 	 * The mac handle is guaranteed to exist since the driver does
1947 	 * ibt_close_hca() (which stops further events from being delivered)
1948 	 * before mac_unregister(). At this point, mac_register() is
1949 	 * guaranteed to have already been done.
1950 	 */
1951 	mutex_enter(&state->id_link_mutex);
1952 	state->id_link_state = lstate;
1953 	mac_link_update(state->id_mh, lstate);
1954 	mutex_exit(&state->id_link_mutex);
1955 
1956 	ibd_async_done(state);
1957 }
1958 
1959 /*
1960  * When the link is notified up, we need to do a few things, based
1961  * on the port's current p_init_type_reply claiming a reinit has been
1962  * done or not. The reinit steps are:
1963  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1964  *    the old Pkey and GID0 are correct.
1965  * 2. Register for mcg traps (already done by ibmf).
1966  * 3. If PreservePresenceReply indicates the SM has restored port's presence
1967  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
1968  * 4. Give up all sendonly memberships.
1969  * 5. Acquire all full memberships.
1970  * 6. In promiscuous mode, acquire all non memberships.
1971  * 7. Recycle all AHs to free list.
1972  */
1973 static void
1974 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1975 {
1976 	ibt_hca_portinfo_t *port_infop;
1977 	ibt_status_t ibt_status;
1978 	uint_t psize, port_infosz;
1979 	ibd_link_op_t opcode;
1980 	ibd_req_t *req;
1981 
1982 	/*
1983 	 * Do not send a request to the async daemon if it has not
1984 	 * yet been created or is being destroyed. If the async
1985 	 * daemon has not yet been created, we still need to track
1986 	 * last known state of the link. If this code races with the
1987 	 * detach path, then we are assured that the detach path has
1988 	 * not yet done the ibt_close_hca (which waits for all async
1989 	 * events to complete). If the code races with the attach path,
1990 	 * we need to validate the pkey/gid (in the link_up case) if
1991 	 * the initialization path has already set these up and created
1992 	 * IBTF resources based on the values.
1993 	 */
1994 	mutex_enter(&state->id_link_mutex);
1995 
1996 	/*
1997 	 * If the init code in ibd_m_start hasn't yet set up the
1998 	 * pkey/gid, nothing to do; that code will set the link state.
1999 	 */
2000 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2001 		mutex_exit(&state->id_link_mutex);
2002 		return;
2003 	}
2004 
2005 	if ((code == IBT_EVENT_PORT_UP) || (code == IBT_CLNT_REREG_EVENT) ||
2006 	    (code == IBT_PORT_CHANGE_EVENT)) {
2007 		uint8_t itreply;
2008 		boolean_t badup = B_FALSE;
2009 
2010 		ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
2011 		    state->id_port, &port_infop, &psize, &port_infosz);
2012 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
2013 			mutex_exit(&state->id_link_mutex);
2014 			DPRINT(10, "ibd_link_up : failed in"
2015 			    " ibt_query_port()\n");
2016 			return;
2017 		}
2018 
2019 		/*
2020 		 * If the link already went down by the time the handler gets
2021 		 * here, give up; we cannot even validate pkey/gid since those
2022 		 * are not valid.
2023 		 */
2024 		if (port_infop->p_linkstate != IBT_PORT_ACTIVE)
2025 			badup = B_TRUE;
2026 
2027 		itreply = port_infop->p_init_type_reply;
2028 
2029 		/*
2030 		 * In InitTypeReply, check if NoLoadReply ==
2031 		 * PreserveContentReply == 0, in which case, verify Pkey/GID0.
2032 		 */
2033 		if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2034 		    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) &&
2035 		    (!badup)) {
2036 			/*
2037 			 * Check that the subnet part of GID0 has not changed.
2038 			 */
2039 			if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid,
2040 			    sizeof (ib_gid_t)) != 0)
2041 				badup = B_TRUE;
2042 
2043 			/*
2044 			 * Check that Pkey/index mapping is still valid.
2045 			 */
2046 			if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) ||
2047 			    (port_infop->p_pkey_tbl[state->id_pkix] !=
2048 			    state->id_pkey))
2049 				badup = B_TRUE;
2050 		}
2051 
2052 		/*
2053 		 * In InitTypeReply, if PreservePresenceReply indicates the SM
2054 		 * has ensured that the port's presence in mcg, traps etc is
2055 		 * intact, nothing more to do.
2056 		 */
2057 		opcode = IBD_LINK_UP_ABSENT;
2058 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2059 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
2060 			opcode = IBD_LINK_UP;
2061 
2062 		ibt_free_portinfo(port_infop, port_infosz);
2063 
2064 		if (badup) {
2065 			code = IBT_ERROR_PORT_DOWN;
2066 		} else if (code == IBT_PORT_CHANGE_EVENT) {
2067 			mutex_exit(&state->id_link_mutex);
2068 			return;
2069 		}
2070 	}
2071 
2072 	if (!ibd_async_safe(state)) {
2073 		state->id_link_state = (((code == IBT_EVENT_PORT_UP) ||
2074 		    (code == IBT_CLNT_REREG_EVENT)) ?  LINK_STATE_UP :
2075 		    LINK_STATE_DOWN);
2076 		mutex_exit(&state->id_link_mutex);
2077 		return;
2078 	}
2079 	mutex_exit(&state->id_link_mutex);
2080 
2081 	if (code == IBT_ERROR_PORT_DOWN)
2082 		opcode = IBD_LINK_DOWN;
2083 
2084 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2085 	req->rq_ptr = (void *)opcode;
2086 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2087 }
2088 
2089 /*
2090  * For the port up/down events, IBTL guarantees there will not be concurrent
2091  * invocations of the handler. IBTL might coalesce link transition events,
2092  * and not invoke the handler for _each_ up/down transition, but it will
2093  * invoke the handler with the last known state.
2094  */
2095 static void
2096 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2097     ibt_async_code_t code, ibt_async_event_t *event)
2098 {
2099 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2100 
2101 	switch (code) {
2102 	case IBT_ERROR_CATASTROPHIC_CHAN:
2103 		ibd_print_warn(state, "catastrophic channel error");
2104 		break;
2105 	case IBT_ERROR_CQ:
2106 		ibd_print_warn(state, "completion queue error");
2107 		break;
2108 	case IBT_PORT_CHANGE_EVENT:
2109 		/*
2110 		 * Events will be delivered to all instances that have
2111 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2112 		 * Only need to do work for our port; IBTF will deliver
2113 		 * events for other ports on the hca we have ibt_open_hca'ed
2114 		 * too. Note that id_port is initialized in ibd_attach()
2115 		 * before we do the ibt_open_hca() call there.
2116 		 */
2117 		ASSERT(state->id_hca_hdl == hca_hdl);
2118 		if (state->id_port != event->ev_port)
2119 			break;
2120 
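		/*
		 * Of the port attribute changes reported by this event, only
		 * a P_Key table change matters to us; everything else is
		 * ignored here.
		 */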
2121 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2122 		    IBT_PORT_CHANGE_PKEY) {
2123 			ibd_link_mod(state, code);
2124 		}
2125 		break;
2126 	case IBT_ERROR_PORT_DOWN:
2127 	case IBT_CLNT_REREG_EVENT:
2128 	case IBT_EVENT_PORT_UP:
2129 		/*
2130 		 * Events will be delivered to all instances that have
2131 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2132 		 * Only need to do work for our port; IBTF will deliver
2133 		 * events for other ports on the hca we have ibt_open_hca'ed
2134 		 * too. Note that id_port is initialized in ibd_attach()
2135 		 * before we do the ibt_open_hca() call there.
2136 		 */
2137 		ASSERT(state->id_hca_hdl == hca_hdl);
2138 		if (state->id_port != event->ev_port)
2139 			break;
2140 
2141 		ibd_link_mod(state, code);
2142 		break;
2143 
2144 	case IBT_HCA_ATTACH_EVENT:
2145 	case IBT_HCA_DETACH_EVENT:
2146 		/*
2147 		 * When a new card is plugged into the system, attach_event is
2148 		 * invoked. Additionally, a cfgadm needs to be run to make the
2149 		 * card known to the system, and an ifconfig needs to be run to
2150 		 * plumb up any ibd interfaces on the card. In the case of card
2151 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2152 		 * unplumb the ibd interfaces on the card; when the card is
2153 		 * actually unplugged, the detach_event is invoked;
2154 		 * additionally, if any ibd instances are still active on the
2155 		 * card (e.g., there were no associated RCM scripts), the driver's
2156 		 * detach routine is invoked.
2157 		 */
2158 		break;
2159 	default:
2160 		break;
2161 	}
2162 }
2163 
2164 static int
2165 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2166 {
2167 	mac_register_t *macp;
2168 	int ret;
2169 
2170 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2171 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2172 		return (DDI_FAILURE);
2173 	}
2174 
2175 	/*
2176 	 * Note that when we register with mac during attach, we don't
2177 	 * have the id_macaddr yet, so we'll simply be registering a
2178 	 * zero macaddr that we'll overwrite later during plumb (in
2179 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2180 	 * update the mac layer with the correct mtu during plumb.
2181 	 */
2182 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2183 	macp->m_driver = state;
2184 	macp->m_dip = dip;
2185 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2186 	macp->m_callbacks = &ibd_m_callbacks;
2187 	macp->m_min_sdu = 0;
2188 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2189 
2190 	/*
2191 	 *  Register ourselves with the GLDv3 interface
2192 	 */
2193 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2194 		mac_free(macp);
2195 		DPRINT(10,
2196 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2197 		return (DDI_FAILURE);
2198 	}
2199 
2200 	mac_free(macp);
2201 	return (DDI_SUCCESS);
2202 }
2203 
2204 static int
2205 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2206 {
2207 	ibt_hca_attr_t hca_attrs;
2208 	ibt_status_t ibt_status;
2209 
2210 	/*
2211 	 * Query the HCA and fetch its attributes
2212 	 */
2213 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2214 	ASSERT(ibt_status == IBT_SUCCESS);
2215 
2216 	/*
2217 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2218 	 *    full checksum offload.
2219 	 */
2220 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2221 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2222 	}
2223 
2224 	/*
2225 	 * 2. Set LSO policy, capability and maximum length
2226 	 */
2227 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2228 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2229 		state->id_lso_policy = B_TRUE;
2230 	} else {
2231 		state->id_lso_policy = B_FALSE;
2232 	}
2233 	if (hca_attrs.hca_max_lso_size > 0) {
2234 		state->id_lso_capable = B_TRUE;
2235 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2236 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2237 		else
2238 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2239 	} else {
2240 		state->id_lso_capable = B_FALSE;
2241 		state->id_lso_maxlen = 0;
2242 	}
2243 
2244 	/*
2245 	 * 3. Set Reserved L_Key capability
2246 	 */
2247 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2248 		state->id_hca_res_lkey_capab = 1;
2249 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2250 	}
2251 
2252 	/*
2253 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2254 	 *    size information is provided by the hca
2255 	 */
2256 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2257 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2258 	} else {
2259 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2260 	}
2261 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2262 		state->id_max_sqseg = IBD_MAX_SQSEG;
2263 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2264 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2265 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2266 	}
2267 
2268 	/*
2269 	 * 5. Set number of recv and send wqes after checking hca maximum
2270 	 *    channel size
2271 	 */
2272 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2273 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2274 	} else {
2275 		state->id_num_rwqe = IBD_NUM_RWQE;
2276 	}
2277 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2278 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2279 	} else {
2280 		state->id_num_swqe = IBD_NUM_SWQE;
2281 	}
2282 
2283 	return (DDI_SUCCESS);
2284 }
2285 
2286 static int
2287 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2288 {
2289 	int instance;
2290 	uint32_t progress = state->id_mac_state;
2291 	ibt_status_t ret;
2292 
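	/*
	 * Tear down in roughly the reverse order of ibd_attach(); the
	 * progress bits recorded in id_mac_state ensure that only the
	 * steps which actually completed are undone.
	 */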
2293 	if (progress & IBD_DRV_MAC_REGISTERED) {
2294 		(void) mac_unregister(state->id_mh);
2295 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2296 	}
2297 
2298 	if (progress & IBD_DRV_PD_ALLOCD) {
2299 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2300 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2301 			ibd_print_warn(state, "failed to free "
2302 			    "protection domain, ret=%d", ret);
2303 		}
2304 		state->id_pd_hdl = NULL;
2305 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2306 	}
2307 
2308 	if (progress & IBD_DRV_HCA_OPENED) {
2309 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2310 		    IBT_SUCCESS) {
2311 			ibd_print_warn(state, "failed to close "
2312 			    "HCA device, ret=%d", ret);
2313 		}
2314 		state->id_hca_hdl = NULL;
2315 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2316 	}
2317 
2318 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2319 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
2320 			ibd_print_warn(state,
2321 			    "ibt_detach() failed, ret=%d", ret);
2322 		}
2323 		state->id_ibt_hdl = NULL;
2324 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2325 	}
2326 
2327 	if (progress & IBD_DRV_TXINTR_ADDED) {
2328 		ddi_remove_softintr(state->id_tx);
2329 		state->id_tx = NULL;
2330 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2331 	}
2332 
2333 	if (progress & IBD_DRV_RXINTR_ADDED) {
2334 		ddi_remove_softintr(state->id_rx);
2335 		state->id_rx = NULL;
2336 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2337 	}
2338 
2339 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2340 		ibd_state_fini(state);
2341 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2342 	}
2343 
2344 	instance = ddi_get_instance(dip);
2345 	ddi_soft_state_free(ibd_list, instance);
2346 
2347 	return (DDI_SUCCESS);
2348 }
2349 
2350 /*
2351  * Attach device to the IO framework.
2352  */
2353 static int
2354 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2355 {
2356 	ibd_state_t *state = NULL;
2357 	ib_guid_t hca_guid;
2358 	int instance;
2359 	ibt_status_t ret;
2360 	int rv;
2361 
2362 	/*
2363 	 * IBD doesn't support suspend/resume
2364 	 */
2365 	if (cmd != DDI_ATTACH)
2366 		return (DDI_FAILURE);
2367 
2368 	/*
2369 	 * Allocate softstate structure
2370 	 */
2371 	instance = ddi_get_instance(dip);
2372 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2373 		return (DDI_FAILURE);
2374 	state = ddi_get_soft_state(ibd_list, instance);
2375 
2376 	/*
2377 	 * Initialize mutexes and condition variables
2378 	 */
2379 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2380 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2381 		goto attach_fail;
2382 	}
2383 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2384 
2385 	/*
2386 	 * Allocate rx,tx softintr
2387 	 */
2388 	if (ibd_rx_softintr == 1) {
2389 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2390 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2391 			DPRINT(10, "ibd_attach: failed in "
2392 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2393 			goto attach_fail;
2394 		}
2395 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2396 	}
2397 	if (ibd_tx_softintr == 1) {
2398 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2399 		    NULL, NULL, ibd_tx_recycle,
2400 		    (caddr_t)state)) != DDI_SUCCESS) {
2401 			DPRINT(10, "ibd_attach: failed in "
2402 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2403 			goto attach_fail;
2404 		}
2405 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2406 	}
2407 
2408 	/*
2409 	 * Obtain IBA P_Key, port number and HCA guid and validate
2410 	 * them (for P_Key, only full members are allowed as per
2411 	 * IPoIB specification; neither port number nor HCA guid
2412 	 * can be zero)
2413 	 */
2414 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2415 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2416 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2417 		    state->id_pkey);
2418 		goto attach_fail;
2419 	}
2420 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2421 	    "port-number", 0)) == 0) {
2422 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2423 		    state->id_port);
2424 		goto attach_fail;
2425 	}
2426 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2427 	    "hca-guid", 0)) == 0) {
2428 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2429 		    hca_guid);
2430 		goto attach_fail;
2431 	}
2432 
2433 	/*
2434 	 * Attach to IBTL
2435 	 */
2436 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2437 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2438 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2439 		goto attach_fail;
2440 	}
2441 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2442 
2443 	/*
2444 	 * Open the HCA
2445 	 */
2446 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2447 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2448 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2449 		goto attach_fail;
2450 	}
2451 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2452 
2453 	/*
2454 	 * Record capabilities
2455 	 */
2456 	(void) ibd_record_capab(state, dip);
2457 
2458 	/*
2459 	 * Allocate a protection domain on the HCA
2460 	 */
2461 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2462 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2463 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2464 		goto attach_fail;
2465 	}
2466 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2467 
2468 
2469 	/*
2470 	 * Register ibd interfaces with the Nemo framework
2471 	 */
2472 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2473 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2474 		goto attach_fail;
2475 	}
2476 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2477 
2478 	/*
2479 	 * We're done with everything we could to make the attach
2480 	 * succeed.  All the buffer allocations and IPoIB broadcast
2481 	 * group joins are deferred to when the interface instance
2482 	 * is actually plumbed to avoid wasting memory.
2483 	 */
2484 	return (DDI_SUCCESS);
2485 
2486 attach_fail:
2487 	ibd_unattach(state, dip);
2488 	return (DDI_FAILURE);
2489 }
2490 
2491 /*
2492  * Detach device from the IO framework.
2493  */
2494 static int
2495 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2496 {
2497 	ibd_state_t *state;
2498 	int instance;
2499 
2500 	/*
2501 	 * IBD doesn't support suspend/resume
2502 	 */
2503 	if (cmd != DDI_DETACH)
2504 		return (DDI_FAILURE);
2505 
2506 	/*
2507 	 * Get the instance softstate
2508 	 */
2509 	instance = ddi_get_instance(dip);
2510 	state = ddi_get_soft_state(ibd_list, instance);
2511 
2512 	/*
2513 	 * Release all resources we're holding still.  Note that if we'd
2514 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2515 	 * so far, we should find all the flags we need in id_mac_state.
2516 	 */
2517 	(void) ibd_unattach(state, dip);
2518 
2519 	return (DDI_SUCCESS);
2520 }
2521 
2522 /*
2523  * Pre ibt_attach() driver initialization
2524  */
2525 static int
2526 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2527 {
2528 	char buf[64];
2529 
2530 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2531 	state->id_link_state = LINK_STATE_UNKNOWN;
2532 
2533 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2534 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2535 	state->id_trap_stop = B_TRUE;
2536 	state->id_trap_inprog = 0;
2537 
2538 	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2539 	state->id_dip = dip;
2540 
2541 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2542 
2543 	state->id_tx_list.dl_head = NULL;
2544 	state->id_tx_list.dl_tail = NULL;
2545 	state->id_tx_list.dl_pending_sends = B_FALSE;
2546 	state->id_tx_list.dl_cnt = 0;
2547 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2548 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2549 	state->id_tx_busy = 0;
2550 
2551 	state->id_rx_list.dl_head = NULL;
2552 	state->id_rx_list.dl_tail = NULL;
2553 	state->id_rx_list.dl_bufs_outstanding = 0;
2554 	state->id_rx_list.dl_cnt = 0;
2555 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2556 	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
2557 
2558 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2559 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2560 	    0, NULL, NULL, NULL, NULL, NULL, 0);
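	/*
	 * This cache backs the ibd_req_t work requests handed to the async
	 * thread; the lookup/Tx fast path allocates from it with KM_NOSLEEP
	 * (e.g. ibd_acache_lookup()), while async-safe contexts such as
	 * ibd_link_mod() can use KM_SLEEP.
	 */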
2561 
2562 #ifdef IBD_LOGGING
2563 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
2564 #endif
2565 
2566 	return (DDI_SUCCESS);
2567 }
2568 
2569 /*
2570  * Post ibt_detach() driver deconstruction
2571  */
2572 static void
2573 ibd_state_fini(ibd_state_t *state)
2574 {
2575 	kmem_cache_destroy(state->id_req_kmc);
2576 
2577 	mutex_destroy(&state->id_rxpost_lock);
2578 	mutex_destroy(&state->id_rx_list.dl_mutex);
2579 
2580 	mutex_destroy(&state->id_txpost_lock);
2581 	mutex_destroy(&state->id_tx_list.dl_mutex);
2582 
2583 	mutex_destroy(&state->id_sched_lock);
2584 	mutex_destroy(&state->id_cq_poll_lock);
2585 
2586 	cv_destroy(&state->id_trap_cv);
2587 	mutex_destroy(&state->id_trap_lock);
2588 	mutex_destroy(&state->id_link_mutex);
2589 
2590 #ifdef IBD_LOGGING
2591 	mutex_destroy(&ibd_lbuf_lock);
2592 #endif
2593 }
2594 
2595 /*
2596  * Fetch link speed from SA for snmp ifspeed reporting.
2597  */
2598 static uint64_t
2599 ibd_get_portspeed(ibd_state_t *state)
2600 {
2601 	int			ret;
2602 	ibt_path_info_t		path;
2603 	ibt_path_attr_t		path_attr;
2604 	uint8_t			num_paths;
2605 	uint64_t		ifspeed;
2606 
2607 	/*
2608 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2609 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2610 	 * 2000000000. Start with that as default.
2611 	 */
2612 	ifspeed = 2000000000;
2613 
2614 	bzero(&path_attr, sizeof (path_attr));
2615 
2616 	/*
2617 	 * Get the port speed from Loopback path information.
2618 	 */
2619 	path_attr.pa_dgids = &state->id_sgid;
2620 	path_attr.pa_num_dgids = 1;
2621 	path_attr.pa_sgid = state->id_sgid;
2622 
2623 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2624 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2625 		goto earlydone;
2626 
2627 	if (num_paths < 1)
2628 		goto earlydone;
2629 
2630 	/*
2631 	 * In case SA does not return an expected value, report the default
2632 	 * speed as 1X.
2633 	 */
2634 	ret = 1;
2635 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2636 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2637 			ret = 1;
2638 			break;
2639 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2640 			ret = 4;
2641 			break;
2642 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2643 			ret = 12;
2644 			break;
2645 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2646 			ret = 2;
2647 			break;
2648 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2649 			ret = 8;
2650 			break;
2651 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2652 			ret = 16;
2653 			break;
2654 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2655 			ret = 24;
2656 			break;
2657 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2658 			ret = 32;
2659 			break;
2660 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2661 			ret = 48;
2662 			break;
2663 	}
2664 
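	/*
	 * The final ifspeed is the 1X data rate scaled by the width/speed
	 * multiple; e.g. a 4X DDR link (IBT_SRATE_20) is reported as
	 * 8 * 2000000000, i.e. 16 Gb/s.
	 */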
2665 	ifspeed *= ret;
2666 
2667 earlydone:
2668 	return (ifspeed);
2669 }
2670 
2671 /*
2672  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2673  * representing the input mcg mgid.
2674  */
2675 static ibd_mce_t *
2676 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2677 {
2678 	ibd_mce_t *ptr = list_head(mlist);
2679 
2680 	/*
2681 	 * Do plain linear search.
2682 	 */
2683 	while (ptr != NULL) {
2684 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2685 		    sizeof (ib_gid_t)) == 0)
2686 			return (ptr);
2687 		ptr = list_next(mlist, ptr);
2688 	}
2689 	return (NULL);
2690 }
2691 
2692 /*
2693  * Execute IBA JOIN.
2694  */
2695 static ibt_status_t
2696 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2697 {
2698 	ibt_mcg_attr_t mcg_attr;
2699 
2700 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2701 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2702 	mcg_attr.mc_mgid = mgid;
2703 	mcg_attr.mc_join_state = mce->mc_jstate;
2704 	mcg_attr.mc_scope = state->id_scope;
2705 	mcg_attr.mc_pkey = state->id_pkey;
2706 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2707 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2708 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
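	/*
	 * The qkey, SL, flow label and traffic class are all taken from the
	 * broadcast group info (id_mcinfo) looked up in ibd_find_bgroup(),
	 * so every join done by this interface uses the link-wide parameters
	 * established by the broadcast group.
	 */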
2709 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2710 	    NULL, NULL));
2711 }
2712 
2713 /*
2714  * This code JOINs the port in the proper way (depending on the join
2715  * state) so that IBA fabric will forward mcg packets to/from the port.
2716  * It also attaches the QPN to the mcg so it can receive those mcg
2717  * packets. This code makes sure not to attach the mcg to the QP if
2718  * that has been previously done due to the mcg being joined with a
2719  * different join state, even though this is not required by SWG_0216,
2720  * refid 3610.
2721  */
2722 static ibd_mce_t *
2723 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2724 {
2725 	ibt_status_t ibt_status;
2726 	ibd_mce_t *mce, *tmce, *omce = NULL;
2727 	boolean_t do_attach = B_TRUE;
2728 
2729 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2730 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2731 
2732 	/*
2733 	 * For enable_multicast Full member joins, we need to do some
2734 	 * extra work. If there is already an mce on the list that
2735 	 * indicates full membership, that means the membership has
2736 	 * not yet been dropped (since the disable_multicast was issued)
2737 	 * because there are pending Tx's to the mcg; in that case, just
2738 	 * mark the mce not to be reaped when the Tx completion queues
2739 	 * an async reap operation.
2740 	 *
2741 	 * If there is already an mce on the list indicating sendonly
2742 	 * membership, try to promote to full membership. Be careful
2743 	 * not to deallocate the old mce, since there might be an AH
2744 	 * pointing to it; instead, update the old mce with new data
2745 	 * that tracks the full membership.
2746 	 */
2747 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2748 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2749 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2750 			ASSERT(omce->mc_fullreap);
2751 			omce->mc_fullreap = B_FALSE;
2752 			return (omce);
2753 		} else {
2754 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2755 		}
2756 	}
2757 
2758 	/*
2759 	 * Allocate the ibd_mce_t to track this JOIN.
2760 	 */
2761 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2762 	mce->mc_fullreap = B_FALSE;
2763 	mce->mc_jstate = jstate;
2764 
2765 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2766 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2767 		    ibt_status);
2768 		kmem_free(mce, sizeof (ibd_mce_t));
2769 		return (NULL);
2770 	}
2771 
2772 	/*
2773 	 * Is an IBA attach required? Not if the interface is already joined
2774 	 * to the mcg in a different appropriate join state.
2775 	 */
2776 	if (jstate == IB_MC_JSTATE_NON) {
2777 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2778 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2779 			do_attach = B_FALSE;
2780 	} else if (jstate == IB_MC_JSTATE_FULL) {
2781 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2782 			do_attach = B_FALSE;
2783 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
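		/*
		 * A send-only non-member join is used purely for transmitting
		 * to the group; nothing is received on it, so the QP never
		 * needs to be attached to the mcg.
		 */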
2784 		do_attach = B_FALSE;
2785 	}
2786 
2787 	if (do_attach) {
2788 		/*
2789 		 * Do the IBA attach.
2790 		 */
2791 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2792 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2793 		    &mce->mc_info)) != IBT_SUCCESS) {
2794 			DPRINT(10, "ibd_join_group : failed qp attachment "
2795 			    "%d\n", ibt_status);
2796 			/*
2797 			 * NOTE that we should probably preserve the join info
2798 			 * in the list and later try to leave again at detach
2799 			 * time.
2800 			 */
2801 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2802 			    state->id_sgid, jstate);
2803 			kmem_free(mce, sizeof (ibd_mce_t));
2804 			return (NULL);
2805 		}
2806 	}
2807 
2808 	/*
2809 	 * Insert the ibd_mce_t in the proper list.
2810 	 */
2811 	if (jstate == IB_MC_JSTATE_NON) {
2812 		IBD_MCACHE_INSERT_NON(state, mce);
2813 	} else {
2814 		/*
2815 		 * Set up the mc_req fields used for reaping the
2816 		 * mcg in case of delayed tx completion (see
2817 		 * ibd_tx_cleanup()). Also done for sendonly join in
2818 		 * case we are promoted to fullmembership later and
2819 		 * keep using the same mce.
2820 		 */
2821 		mce->mc_req.rq_gid = mgid;
2822 		mce->mc_req.rq_ptr = mce;
2823 		/*
2824 		 * Check whether this is the case where we are trying to join
2825 		 * as a full member while we were already joined send-only.
2826 		 * We try to drop our SendOnly membership, but it is
2827 		 * possible that the mcg does not exist anymore (and
2828 		 * the subnet trap never reached us), so the leave
2829 		 * operation might fail.
2830 		 */
2831 		if (omce != NULL) {
2832 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2833 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2834 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2835 			bcopy(&mce->mc_info, &omce->mc_info,
2836 			    sizeof (ibt_mcg_info_t));
2837 			kmem_free(mce, sizeof (ibd_mce_t));
2838 			return (omce);
2839 		}
2840 		mutex_enter(&state->id_mc_mutex);
2841 		IBD_MCACHE_INSERT_FULL(state, mce);
2842 		mutex_exit(&state->id_mc_mutex);
2843 	}
2844 
2845 	return (mce);
2846 }
2847 
2848 /*
2849  * Called during port up event handling to attempt to reacquire full
2850  * membership to an mcg. Stripped down version of ibd_join_group().
2851  * Note that it is possible that the mcg might have gone away, and
2852  * gets recreated at this point.
2853  */
2854 static void
2855 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2856 {
2857 	ib_gid_t mgid;
2858 
2859 	/*
2860 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2861 	 * reap/leave is going to try to leave the group. We could prevent
2862 	 * that by adding a boolean flag into ibd_mce_t, if required.
2863 	 */
2864 	if (mce->mc_fullreap)
2865 		return;
2866 
2867 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2868 
2869 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2870 	    mgid.gid_guid);
2871 
2872 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2873 		ibd_print_warn(state, "Failure on port up to rejoin "
2874 		    "multicast gid %016llx:%016llx",
2875 		    (u_longlong_t)mgid.gid_prefix,
2876 		    (u_longlong_t)mgid.gid_guid);
2877 }
2878 
2879 /*
2880  * This code handles delayed Tx completion cleanups for mcg's to which
2881  * disable_multicast has been issued, regular mcg related cleanups during
2882  * disable_multicast, disable_promiscuous and mcg traps, as well as
2883  * cleanups during driver detach time. Depending on the join state,
2884  * it deletes the mce from the appropriate list and issues the IBA
2885  * leave/detach; except in the disable_multicast case when the mce
2886  * is left on the active list for a subsequent Tx completion cleanup.
2887  */
2888 static void
2889 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2890     uint8_t jstate)
2891 {
2892 	ibd_mce_t *tmce;
2893 	boolean_t do_detach = B_TRUE;
2894 
2895 	/*
2896 	 * Before detaching, we must check whether the other list
2897 	 * contains the mcg; if we detach blindly, the consumer
2898 	 * who set up the other list will also stop receiving
2899 	 * traffic.
2900 	 */
2901 	if (jstate == IB_MC_JSTATE_FULL) {
2902 		/*
2903 		 * The following check is only relevant while coming
2904 		 * from the Tx completion path in the reap case.
2905 		 */
2906 		if (!mce->mc_fullreap)
2907 			return;
2908 		mutex_enter(&state->id_mc_mutex);
2909 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2910 		mutex_exit(&state->id_mc_mutex);
2911 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2912 			do_detach = B_FALSE;
2913 	} else if (jstate == IB_MC_JSTATE_NON) {
2914 		IBD_MCACHE_PULLOUT_NON(state, mce);
2915 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2916 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2917 			do_detach = B_FALSE;
2918 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2919 		mutex_enter(&state->id_mc_mutex);
2920 		IBD_MCACHE_PULLOUT_FULL(state, mce);
2921 		mutex_exit(&state->id_mc_mutex);
2922 		do_detach = B_FALSE;
2923 	}
2924 
2925 	/*
2926 	 * If we are reacting to a mcg trap and leaving our sendonly or
2927 	 * non membership, the mcg is possibly already gone, so attempting
2928 	 * to leave might fail. On the other hand, we must try to leave
2929 	 * anyway, since this might be a trap from long ago, and we could
2930 	 * have potentially sendonly joined to a recent incarnation of
2931  * the mcg and are about to lose track of this information.
2932 	 */
2933 	if (do_detach) {
2934 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2935 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2936 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
2937 	}
2938 
2939 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2940 	kmem_free(mce, sizeof (ibd_mce_t));
2941 }
2942 
2943 /*
2944  * Async code executed due to multicast and promiscuous disable requests
2945  * and mcg trap handling; also executed during driver detach. Mostly, a
2946  * leave and detach is done; except for the fullmember case when Tx
2947  * requests are pending, whence arrangements are made for subsequent
2948  * cleanup on Tx completion.
2949  */
2950 static void
2951 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2952 {
2953 	ipoib_mac_t mcmac;
2954 	boolean_t recycled;
2955 	ibd_mce_t *mce;
2956 
2957 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
2958 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2959 
2960 	if (jstate == IB_MC_JSTATE_NON) {
2961 		recycled = B_TRUE;
2962 		mce = IBD_MCACHE_FIND_NON(state, mgid);
2963 		/*
2964 		 * In case we are handling a mcg trap, we might not find
2965 		 * the mcg in the non list.
2966 		 */
2967 		if (mce == NULL)
2968 			return;
2969 	} else {
2970 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
2971 
2972 		/*
2973 		 * In case we are handling a mcg trap, make sure the trap
2974 		 * is not arriving late; if we have an mce that indicates
2975 		 * that we are already a fullmember, that would be a clear
2976 		 * indication that the trap arrived late (ie, is for a
2977 		 * previous incarnation of the mcg).
2978 		 */
2979 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
2980 			if ((mce == NULL) || (mce->mc_jstate ==
2981 			    IB_MC_JSTATE_FULL))
2982 				return;
2983 		} else {
2984 			ASSERT(jstate == IB_MC_JSTATE_FULL);
2985 
2986 			/*
2987 			 * If the join-group operation failed, mce will be
2988 			 * NULL here, because the GLDv3 set-multicast entry
2989 			 * point always returns success.
2990 			 */
2991 			if (mce == NULL)
2992 				return;
2993 
2994 			mce->mc_fullreap = B_TRUE;
2995 		}
2996 
2997 		/*
2998 		 * If no pending Tx's remain that reference the AH
2999 		 * for the mcg, recycle it from active to free list.
3000 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3001 		 * so the last completing Tx will cause an async reap
3002 		 * operation to be invoked, at which time we will drop our
3003 		 * membership to the mcg so that the pending Tx's complete
3004 		 * successfully. Refer to comments on "AH and MCE active
3005 		 * list manipulation" at top of this file. The lock protects
3006 		 * against Tx fast path and Tx cleanup code.
3007 		 */
3008 		mutex_enter(&state->id_ac_mutex);
3009 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3010 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3011 		    IB_MC_JSTATE_SEND_ONLY_NON));
3012 		mutex_exit(&state->id_ac_mutex);
3013 	}
3014 
3015 	if (recycled) {
3016 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3017 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3018 		ibd_async_reap_group(state, mce, mgid, jstate);
3019 	}
3020 }
3021 
3022 /*
3023  * Find the broadcast address as defined by IPoIB; implicitly
3024  * determines the IBA scope, mtu, tclass etc of the link the
3025  * interface is going to be a member of.
3026  */
3027 static ibt_status_t
3028 ibd_find_bgroup(ibd_state_t *state)
3029 {
3030 	ibt_mcg_attr_t mcg_attr;
3031 	uint_t numg;
3032 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3033 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3034 	    IB_MC_SCOPE_GLOBAL };
3035 	int i, mcgmtu;
3036 	boolean_t found = B_FALSE;
3037 
3038 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3039 	mcg_attr.mc_pkey = state->id_pkey;
3040 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3041 
3042 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3043 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3044 
3045 		/*
3046 		 * Look for the IPoIB broadcast group.
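		 * The prefix combines the IPv4 MGID signature, the scope
		 * being tried, and our P_Key, while the group id (set above
		 * from IB_MGID_IPV4_LOWGRP_MASK) selects the broadcast group;
		 * together these form the well-known IPoIB broadcast MGID
		 * (roughly ff1x:401b:<P_Key>::ffff:ffff).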
3047 		 */
3048 		state->id_mgid.gid_prefix =
3049 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3050 		    ((uint64_t)state->id_scope << 48) |
3051 		    ((uint32_t)(state->id_pkey << 16)));
3052 		mcg_attr.mc_mgid = state->id_mgid;
3053 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3054 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3055 			found = B_TRUE;
3056 			break;
3057 		}
3058 
3059 	}
3060 
3061 	if (!found) {
3062 		ibd_print_warn(state, "IPoIB broadcast group absent");
3063 		return (IBT_FAILURE);
3064 	}
3065 
3066 	/*
3067 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
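	 * mc_mtu is the IB-encoded MTU; e.g. an encoding of 4 works out to
	 * 128 << 4 = 2048 bytes.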
3068 	 */
3069 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3070 	if (state->id_mtu < mcgmtu) {
3071 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3072 		    "greater than port's maximum MTU %d", mcgmtu,
3073 		    state->id_mtu);
3074 		return (IBT_FAILURE);
3075 	}
3076 	state->id_mtu = mcgmtu;
3077 
3078 	return (IBT_SUCCESS);
3079 }
3080 
3081 static int
3082 ibd_alloc_tx_copybufs(ibd_state_t *state)
3083 {
3084 	ibt_mr_attr_t mem_attr;
3085 
3086 	/*
3087 	 * Allocate one big chunk for all regular tx copy bufs
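	 * When LSO is enabled and usable, each buffer is sized to
	 * IBD_TX_BUF_SZ rather than one MTU, so an entire LSO payload can
	 * be bcopy'ed into a single pre-mapped buffer.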
3088 	 */
3089 	state->id_tx_buf_sz = state->id_mtu;
3090 	if (state->id_lso_policy && state->id_lso_capable &&
3091 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3092 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3093 	}
3094 
3095 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3096 	    state->id_tx_buf_sz, KM_SLEEP);
3097 
3098 	/*
3099 	 * Do one memory registration on the entire txbuf area
3100 	 */
3101 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3102 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3103 	mem_attr.mr_as = NULL;
3104 	mem_attr.mr_flags = IBT_MR_SLEEP;
3105 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3106 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3107 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3108 		kmem_free(state->id_tx_bufs,
3109 		    state->id_num_swqe * state->id_tx_buf_sz);
3110 		state->id_tx_bufs = NULL;
3111 		return (DDI_FAILURE);
3112 	}
3113 
3114 	return (DDI_SUCCESS);
3115 }
3116 
3117 static int
3118 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3119 {
3120 	ibt_mr_attr_t mem_attr;
3121 	ibd_lsobuf_t *buflist;
3122 	ibd_lsobuf_t *lbufp;
3123 	ibd_lsobuf_t *tail;
3124 	ibd_lsobkt_t *bktp;
3125 	uint8_t *membase;
3126 	uint8_t *memp;
3127 	uint_t memsz;
3128 	int i;
3129 
3130 	/*
3131 	 * Allocate the lso bucket
3132 	 */
3133 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3134 
3135 	/*
3136 	 * Allocate the entire lso memory and register it
3137 	 */
3138 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3139 	membase = kmem_zalloc(memsz, KM_SLEEP);
3140 
3141 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3142 	mem_attr.mr_len = memsz;
3143 	mem_attr.mr_as = NULL;
3144 	mem_attr.mr_flags = IBT_MR_SLEEP;
3145 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3146 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3147 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3148 		kmem_free(membase, memsz);
3149 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3150 		return (DDI_FAILURE);
3151 	}
3152 
3153 	/*
3154 	 * Now allocate the buflist.  Note that the elements in the buflist and
3155 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3156 	 * can always derive the address of a buflist entry from the address of
3157 	 * an lso buffer.
3158 	 */
3159 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3160 	    KM_SLEEP);
3161 
3162 	/*
3163 	 * Set up the lso buf chain
3164 	 */
3165 	memp = membase;
3166 	lbufp = buflist;
3167 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3168 		lbufp->lb_isfree = 1;
3169 		lbufp->lb_buf = memp;
3170 		lbufp->lb_next = lbufp + 1;
3171 
3172 		tail = lbufp;
3173 
3174 		memp += IBD_LSO_BUFSZ;
3175 		lbufp++;
3176 	}
3177 	tail->lb_next = NULL;
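	/*
	 * The free list is singly linked and used LIFO-style:
	 * ibd_acquire_lsobufs() takes buffers off bkt_free_head and
	 * ibd_release_lsobufs() pushes them back at the head.
	 */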
3178 
3179 	/*
3180 	 * Set up the LSO buffer information in ibd state
3181 	 */
3182 	bktp->bkt_bufl = buflist;
3183 	bktp->bkt_free_head = buflist;
3184 	bktp->bkt_mem = membase;
3185 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3186 	bktp->bkt_nfree = bktp->bkt_nelem;
3187 
3188 	state->id_lso = bktp;
3189 
3190 	return (DDI_SUCCESS);
3191 }
3192 
3193 /*
3194  * Statically allocate Tx buffer list(s).
3195  */
3196 static int
3197 ibd_init_txlist(ibd_state_t *state)
3198 {
3199 	ibd_swqe_t *swqe;
3200 	ibt_lkey_t lkey;
3201 	int i;
3202 
3203 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3204 		return (DDI_FAILURE);
3205 
3206 	if (state->id_lso_policy && state->id_lso_capable) {
3207 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3208 			state->id_lso_policy = B_FALSE;
3209 	}
3210 
3211 	/*
3212 	 * Allocate and setup the swqe list
3213 	 */
3214 	lkey = state->id_tx_mr_desc.md_lkey;
3215 	for (i = 0; i < state->id_num_swqe; i++) {
3216 		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
3217 			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
3218 			ibd_fini_txlist(state);
3219 			return (DDI_FAILURE);
3220 		}
3221 
3222 		/* add to list */
3223 		state->id_tx_list.dl_cnt++;
3224 		if (state->id_tx_list.dl_head == NULL) {
3225 			swqe->swqe_prev = NULL;
3226 			swqe->swqe_next = NULL;
3227 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3228 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3229 		} else {
3230 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3231 			swqe->swqe_next = NULL;
3232 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3233 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3234 		}
3235 	}
3236 
3237 	return (DDI_SUCCESS);
3238 }
3239 
3240 static int
3241 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3242     uint32_t *nds_p)
3243 {
3244 	ibd_lsobkt_t *bktp;
3245 	ibd_lsobuf_t *lbufp;
3246 	ibd_lsobuf_t *nextp;
3247 	ibt_lkey_t lso_lkey;
3248 	uint_t frag_sz;
3249 	uint_t num_needed;
3250 	int i;
3251 
3252 	ASSERT(sgl_p != NULL);
3253 	ASSERT(nds_p != NULL);
3254 	ASSERT(req_sz != 0);
3255 
3256 	/*
3257 	 * Determine how many bufs we'd need for the size requested
3258 	 */
3259 	num_needed = req_sz / IBD_LSO_BUFSZ;
3260 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3261 		num_needed++;
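	/*
	 * For example, a request of two and a half times IBD_LSO_BUFSZ
	 * needs three buffers; the third sgl entry is trimmed down to
	 * frag_sz (half of IBD_LSO_BUFSZ) further below.
	 */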
3262 
3263 	mutex_enter(&state->id_lso_lock);
3264 
3265 	/*
3266 	 * If we don't have enough lso bufs, return failure
3267 	 */
3268 	ASSERT(state->id_lso != NULL);
3269 	bktp = state->id_lso;
3270 	if (bktp->bkt_nfree < num_needed) {
3271 		mutex_exit(&state->id_lso_lock);
3272 		return (-1);
3273 	}
3274 
3275 	/*
3276 	 * Pick the first 'num_needed' bufs from the free list
3277 	 */
3278 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3279 	lbufp = bktp->bkt_free_head;
3280 	for (i = 0; i < num_needed; i++) {
3281 		ASSERT(lbufp->lb_isfree != 0);
3282 		ASSERT(lbufp->lb_buf != NULL);
3283 
3284 		nextp = lbufp->lb_next;
3285 
3286 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3287 		sgl_p[i].ds_key = lso_lkey;
3288 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3289 
3290 		lbufp->lb_isfree = 0;
3291 		lbufp->lb_next = NULL;
3292 
3293 		lbufp = nextp;
3294 	}
3295 	bktp->bkt_free_head = lbufp;
3296 
3297 	/*
3298 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3299 	 * to adjust the last sgl entry's length. Since we know we need at
3300 	 * least one, the i-1 use below is ok.
3301 	 */
3302 	if (frag_sz) {
3303 		sgl_p[i-1].ds_len = frag_sz;
3304 	}
3305 
3306 	/*
3307 	 * Update nfree count and return
3308 	 */
3309 	bktp->bkt_nfree -= num_needed;
3310 
3311 	mutex_exit(&state->id_lso_lock);
3312 
3313 	*nds_p = num_needed;
3314 
3315 	return (0);
3316 }
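
/*
 * Illustration only, compiled out: a minimal sketch of the buffer
 * accounting done by ibd_acquire_lsobufs() above.  The helper name and
 * the lso_bufsz parameter are hypothetical (the driver uses the
 * IBD_LSO_BUFSZ constant directly).  A request of req_sz bytes needs
 * ceil(req_sz / lso_bufsz) buffers, and the last SGL entry is trimmed
 * to the remainder when req_sz is not a multiple of the buffer size.
 */
#if 0
static uint_t
lso_sgl_lengths(uint_t req_sz, uint_t lso_bufsz, uint_t *last_len)
{
	uint_t num_needed = req_sz / lso_bufsz;
	uint_t frag_sz = req_sz % lso_bufsz;

	if (frag_sz != 0)
		num_needed++;

	/* every SGL entry is lso_bufsz bytes except possibly the last */
	*last_len = (frag_sz != 0) ? frag_sz : lso_bufsz;

	return (num_needed);
}
#endif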
3317 
3318 static void
3319 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3320 {
3321 	ibd_lsobkt_t *bktp;
3322 	ibd_lsobuf_t *lbufp;
3323 	uint8_t *lso_mem_end;
3324 	uint_t ndx;
3325 	int i;
3326 
3327 	mutex_enter(&state->id_lso_lock);
3328 
3329 	bktp = state->id_lso;
3330 	ASSERT(bktp != NULL);
3331 
3332 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3333 	for (i = 0; i < nds; i++) {
3334 		uint8_t *va;
3335 
3336 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3337 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3338 
3339 		/*
3340 		 * Figure out the buflist element this sgl buffer corresponds
3341 		 * to and put it back at the head
3342 		 */
3343 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3344 		lbufp = bktp->bkt_bufl + ndx;
3345 
3346 		ASSERT(lbufp->lb_isfree == 0);
3347 		ASSERT(lbufp->lb_buf == va);
3348 
3349 		lbufp->lb_isfree = 1;
3350 		lbufp->lb_next = bktp->bkt_free_head;
3351 		bktp->bkt_free_head = lbufp;
3352 	}
3353 	bktp->bkt_nfree += nds;
3354 
3355 	mutex_exit(&state->id_lso_lock);
3356 }
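
/*
 * Illustration only, compiled out: the permanent 1-1 relation between
 * lso buffers and buflist entries that ibd_release_lsobufs() relies on
 * above.  The helper names are hypothetical; the point is that the
 * buflist index is recovered from the buffer address (and vice versa)
 * by plain arithmetic, with no per-buffer lookup structure.
 */
#if 0
static uint_t
lso_buf_to_index(uint8_t *base, uint_t bufsz, uint8_t *va)
{
	/* va must lie within the registered lso memory region */
	return ((uint_t)((va - base) / bufsz));
}

static uint8_t *
lso_index_to_buf(uint8_t *base, uint_t bufsz, uint_t ndx)
{
	return (base + (size_t)ndx * bufsz);
}
#endif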
3357 
3358 static void
3359 ibd_free_tx_copybufs(ibd_state_t *state)
3360 {
3361 	/*
3362 	 * Unregister txbuf mr
3363 	 */
3364 	if (ibt_deregister_mr(state->id_hca_hdl,
3365 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3366 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3367 	}
3368 	state->id_tx_mr_hdl = NULL;
3369 
3370 	/*
3371 	 * Free txbuf memory
3372 	 */
3373 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3374 	state->id_tx_bufs = NULL;
3375 }
3376 
3377 static void
3378 ibd_free_tx_lsobufs(ibd_state_t *state)
3379 {
3380 	ibd_lsobkt_t *bktp;
3381 
3382 	mutex_enter(&state->id_lso_lock);
3383 
3384 	if ((bktp = state->id_lso) == NULL) {
3385 		mutex_exit(&state->id_lso_lock);
3386 		return;
3387 	}
3388 
3389 	/*
3390 	 * First, free the buflist
3391 	 */
3392 	ASSERT(bktp->bkt_bufl != NULL);
3393 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3394 
3395 	/*
3396 	 * Unregister the LSO memory and free it
3397 	 */
3398 	ASSERT(bktp->bkt_mr_hdl != NULL);
3399 	if (ibt_deregister_mr(state->id_hca_hdl,
3400 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3401 		DPRINT(10,
3402 		    "ibd_free_tx_lsobufs: ibt_deregister_mr failed");
3403 	}
3404 	ASSERT(bktp->bkt_mem);
3405 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3406 
3407 	/*
3408 	 * Finally free the bucket
3409 	 */
3410 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3411 	state->id_lso = NULL;
3412 
3413 	mutex_exit(&state->id_lso_lock);
3414 }
3415 
3416 /*
3417  * Free the statically allocated Tx buffer list.
3418  */
3419 static void
3420 ibd_fini_txlist(ibd_state_t *state)
3421 {
3422 	ibd_swqe_t *node;
3423 
3424 	/*
3425 	 * Free the allocated swqes
3426 	 */
3427 	mutex_enter(&state->id_tx_list.dl_mutex);
3428 	while (state->id_tx_list.dl_head != NULL) {
3429 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3430 		state->id_tx_list.dl_head = node->swqe_next;
3431 		ASSERT(state->id_tx_list.dl_cnt > 0);
3432 		state->id_tx_list.dl_cnt--;
3433 		ibd_free_swqe(state, node);
3434 	}
3435 	mutex_exit(&state->id_tx_list.dl_mutex);
3436 
3437 	ibd_free_tx_lsobufs(state);
3438 	ibd_free_tx_copybufs(state);
3439 }
3440 
3441 /*
3442  * Allocate a single send wqe and initialize it so it is almost
3443  * ready to be posted to the hardware.
3444  */
3445 static int
3446 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
3447 {
3448 	ibd_swqe_t *swqe;
3449 
3450 	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
3451 	*wqe = swqe;
3452 
3453 	swqe->swqe_type = IBD_WQE_SEND;
3454 	swqe->swqe_next = NULL;
3455 	swqe->swqe_prev = NULL;
3456 	swqe->swqe_im_mblk = NULL;
3457 
3458 	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3459 	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
3460 	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3461 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3462 
3463 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3464 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3465 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3466 
3467 	/* These are set in send */
3468 	swqe->w_swr.wr_nds = 0;
3469 	swqe->w_swr.wr_sgl = NULL;
3470 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3471 
3472 	return (DDI_SUCCESS);
3473 }
3474 
3475 /*
3476  * Free an allocated send wqe.
3477  */
3478 /*ARGSUSED*/
3479 static void
3480 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3481 {
3482 	kmem_free(swqe, sizeof (ibd_swqe_t));
3483 }
3484 
3485 /*
3486  * Post a rwqe to the hardware and add it to the Rx list. The
3487  * "recycle" parameter indicates whether an old rwqe is being
3488  * recycled, or this is a new one.
3489  */
3490 static int
3491 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3492 {
3493 	ibt_status_t ibt_status;
3494 
3495 	if (recycle == B_FALSE) {
3496 		mutex_enter(&state->id_rx_list.dl_mutex);
3497 		if (state->id_rx_list.dl_head == NULL) {
3498 			rwqe->rwqe_prev = NULL;
3499 			rwqe->rwqe_next = NULL;
3500 			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3501 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3502 		} else {
3503 			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3504 			rwqe->rwqe_next = NULL;
3505 			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3506 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3507 		}
3508 		mutex_exit(&state->id_rx_list.dl_mutex);
3509 	}
3510 
3511 	mutex_enter(&state->id_rxpost_lock);
3512 	if (state->id_rx_busy) {
3513 		rwqe->w_post_link = NULL;
3514 		if (state->id_rx_head)
3515 			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
3516 		else
3517 			state->id_rx_head = rwqe;
3518 		state->id_rx_tailp = &(rwqe->w_post_link);
3519 	} else {
3520 		state->id_rx_busy = 1;
3521 		do {
3522 			mutex_exit(&state->id_rxpost_lock);
3523 
3524 			/*
3525 			 * Bump dl_cnt before posting the recv, since dl_cnt
3526 			 * must be updated before the corresponding
3527 			 * ibd_process_rx() is called.
3528 			 */
3529 			atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3530 
3531 			ibt_status = ibt_post_recv(state->id_chnl_hdl,
3532 			    &rwqe->w_rwr, 1, NULL);
3533 			if (ibt_status != IBT_SUCCESS) {
3534 				(void) atomic_add_32_nv(
3535 				    &state->id_rx_list.dl_cnt, -1);
3536 				ibd_print_warn(state, "ibd_post_recv: "
3537 				    "posting failed, ret=%d", ibt_status);
3538 				return (DDI_FAILURE);
3539 			}
3540 
3541 			mutex_enter(&state->id_rxpost_lock);
3542 			rwqe = state->id_rx_head;
3543 			if (rwqe) {
3544 				state->id_rx_head =
3545 				    (ibd_rwqe_t *)(rwqe->w_post_link);
3546 			}
3547 		} while (rwqe);
3548 		state->id_rx_busy = 0;
3549 	}
3550 	mutex_exit(&state->id_rxpost_lock);
3551 
3552 	return (DDI_SUCCESS);
3553 }
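
/*
 * Illustration only, compiled out: the "single poster" pattern used by
 * ibd_post_recv() above, shown with a hypothetical node type and a
 * hypothetical do_post() action.  Callers that find the busy flag set
 * just append to the chain under the lock; the one caller that set the
 * flag keeps draining the chain, dropping the lock around each post,
 * until the chain is empty.
 */
#if 0
typedef struct post_node {
	struct post_node *pn_next;
} post_node_t;

static void
post_or_queue(kmutex_t *lock, int *busy, post_node_t **head,
    post_node_t ***tailp, post_node_t *node, void (*do_post)(post_node_t *))
{
	mutex_enter(lock);
	if (*busy) {
		/* someone else is posting; just chain this node */
		node->pn_next = NULL;
		if (*head != NULL)
			**tailp = node;
		else
			*head = node;
		*tailp = &node->pn_next;
		mutex_exit(lock);
		return;
	}

	*busy = 1;
	do {
		mutex_exit(lock);
		do_post(node);	/* no lock held across the post */
		mutex_enter(lock);
		node = *head;
		if (node != NULL)
			*head = node->pn_next;
	} while (node != NULL);
	*busy = 0;
	mutex_exit(lock);
}
#endif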
3554 
3555 /*
3556  * Statically allocate the Rx buffer list.
3557  */
3558 static int
3559 ibd_init_rxlist(ibd_state_t *state)
3560 {
3561 	ibd_rwqe_t *rwqe;
3562 	int i;
3563 
3564 	for (i = 0; i < state->id_num_rwqe; i++) {
3565 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3566 			ibd_fini_rxlist(state);
3567 			return (DDI_FAILURE);
3568 		}
3569 
3570 		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
3571 			ibd_free_rwqe(state, rwqe);
3572 			ibd_fini_rxlist(state);
3573 			return (DDI_FAILURE);
3574 		}
3575 	}
3576 
3577 	return (DDI_SUCCESS);
3578 }
3579 
3580 /*
3581  * Free the statically allocated Rx buffer list.
3583  */
3584 static void
3585 ibd_fini_rxlist(ibd_state_t *state)
3586 {
3587 	ibd_rwqe_t *node;
3588 
3589 	mutex_enter(&state->id_rx_list.dl_mutex);
3590 	while (state->id_rx_list.dl_head != NULL) {
3591 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3592 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3593 		ASSERT(state->id_rx_list.dl_cnt > 0);
3594 		state->id_rx_list.dl_cnt--;
3595 
3596 		ibd_free_rwqe(state, node);
3597 	}
3598 	mutex_exit(&state->id_rx_list.dl_mutex);
3599 }
3600 
3601 /*
3602  * Allocate a single recv wqe and register it so it is almost
3603  * ready to be posted to the hardware.
3604  */
3605 static int
3606 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3607 {
3608 	ibt_mr_attr_t mem_attr;
3609 	ibd_rwqe_t *rwqe;
3610 
3611 	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3612 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3613 		return (DDI_FAILURE);
3614 	}
3615 	*wqe = rwqe;
3616 	rwqe->rwqe_type = IBD_WQE_RECV;
3617 	rwqe->w_state = state;
3618 	rwqe->rwqe_next = NULL;
3619 	rwqe->rwqe_prev = NULL;
3620 	rwqe->w_freeing_wqe = B_FALSE;
3621 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3622 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3623 
3624 	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3625 	    IPOIB_GRH_SIZE, KM_NOSLEEP);
3626 	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
3627 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3628 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3629 		return (DDI_FAILURE);
3630 	}
3631 
3632 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3633 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3634 	    NULL) {
3635 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3636 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3637 		    state->id_mtu + IPOIB_GRH_SIZE);
3638 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3639 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3640 		return (DDI_FAILURE);
3641 	}
3642 
3643 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3644 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3645 	mem_attr.mr_as = NULL;
3646 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3647 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3648 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3649 	    IBT_SUCCESS) {
3650 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mr()");
3651 		rwqe->w_freeing_wqe = B_TRUE;
3652 		freemsg(rwqe->rwqe_im_mblk);
3653 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3654 		    state->id_mtu + IPOIB_GRH_SIZE);
3655 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3656 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3657 		return (DDI_FAILURE);
3658 	}
3659 
3660 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3661 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3662 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3663 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3664 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3665 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3666 	rwqe->w_rwr.wr_nds = 1;
3667 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3668 
3669 	return (DDI_SUCCESS);
3670 }
3671 
3672 /*
3673  * Free an allocated recv wqe.
3674  */
3675 static void
3676 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3677 {
3678 	if (ibt_deregister_mr(state->id_hca_hdl,
3679 	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
3680 		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
3681 		return;
3682 	}
3683 
3684 	/*
3685 	 * Indicate to the callback function that this rwqe/mblk
3686 	 * should not be recycled. The freemsg() will invoke
3687 	 * ibd_freemsg_cb().
3688 	 */
3689 	if (rwqe->rwqe_im_mblk != NULL) {
3690 		rwqe->w_freeing_wqe = B_TRUE;
3691 		freemsg(rwqe->rwqe_im_mblk);
3692 	}
3693 	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3694 	    state->id_mtu + IPOIB_GRH_SIZE);
3695 	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3696 	kmem_free(rwqe, sizeof (ibd_rwqe_t));
3697 }
3698 
3699 /*
3700  * Delete the rwqe being freed from the rx list.
3701  */
3702 static void
3703 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3704 {
3705 	mutex_enter(&state->id_rx_list.dl_mutex);
3706 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3707 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3708 	else
3709 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3710 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3711 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3712 	else
3713 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3714 	mutex_exit(&state->id_rx_list.dl_mutex);
3715 }
3716 
3717 /*
3718  * IBA Rx/Tx completion queue handler. Guaranteed to be single
3719  * threaded and nonreentrant for this CQ. When using combined CQ,
3720  * this handles Tx and Rx completions. With separate CQs, this handles
3721  * only Rx completions.
3722  */
3723 /* ARGSUSED */
3724 static void
3725 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3726 {
3727 	ibd_state_t *state = (ibd_state_t *)arg;
3728 
3729 	atomic_add_64(&state->id_num_intrs, 1);
3730 
3731 	if (ibd_rx_softintr == 1)
3732 		ddi_trigger_softintr(state->id_rx);
3733 	else
3734 		(void) ibd_intr((char *)state);
3735 }
3736 
3737 /*
3738  * Separate CQ handler for Tx completions, when the Tx CQ is in
3739  * interrupt driven mode.
3740  */
3741 /* ARGSUSED */
3742 static void
3743 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3744 {
3745 	ibd_state_t *state = (ibd_state_t *)arg;
3746 
3747 	atomic_add_64(&state->id_num_intrs, 1);
3748 
3749 	if (ibd_tx_softintr == 1)
3750 		ddi_trigger_softintr(state->id_tx);
3751 	else
3752 		(void) ibd_tx_recycle((char *)state);
3753 }
3754 
3755 /*
3756  * Multicast group create/delete trap handler. These will be delivered
3757  * on a kernel thread (handling can thus block) and can be invoked
3758  * concurrently. The handler can be invoked anytime after it is
3759  * registered and before ibt_detach().
3760  */
3761 /* ARGSUSED */
3762 static void
3763 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3764     ibt_subnet_event_t *event)
3765 {
3766 	ibd_state_t *state = (ibd_state_t *)arg;
3767 	ibd_req_t *req;
3768 
3769 	/*
3770 	 * The trap handler will get invoked once for every event for
3771 	 * every port. The input "gid" is the GID0 of the port the
3772 	 * trap came in on; we just need to act on traps that came
3773 	 * to our port, meaning the port on which the ipoib interface
3774 	 * resides. Since ipoib uses GID0 of the port, we just match
3775 	 * the gids to check whether we need to handle the trap.
3776 	 */
3777 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3778 		return;
3779 
3780 	DPRINT(10, "ibd_notices_handler : %d\n", code);
3781 
3782 	switch (code) {
3783 		case IBT_SM_EVENT_UNAVAILABLE:
3784 			/*
3785 			 * If we are in promiscuous mode or have
3786 			 * sendnonmembers, we need to print a warning
3787 			 * message right now. Else, just store the
3788 			 * information, print when we enter promiscuous
3789 			 * mode or attempt nonmember send. We might
3790 			 * also want to stop caching sendnonmember.
3791 			 */
3792 			ibd_print_warn(state, "IBA multicast support "
3793 			    "degraded due to unavailability of multicast "
3794 			    "traps");
3795 			break;
3796 		case IBT_SM_EVENT_AVAILABLE:
3797 			/*
3798 			 * If we printed a warning message above or
3799 			 * while trying to nonmember send or get into
3800 			 * promiscuous mode, print an okay message.
3801 			 */
3802 			ibd_print_warn(state, "IBA multicast support "
3803 			    "restored due to availability of multicast "
3804 			    "traps");
3805 			break;
3806 		case IBT_SM_EVENT_MCG_CREATED:
3807 		case IBT_SM_EVENT_MCG_DELETED:
3808 			/*
3809 			 * Common processing of creation/deletion traps.
3810 			 * First check if the instance is being
3811 			 * [de]initialized; back off then, without doing
3812 			 * anything more, since we are not sure if the
3813 			 * async thread is around, or whether we might
3814 			 * be racing with the detach code in ibd_m_stop()
3815 			 * that scans the mcg list.
3816 			 */
3817 			if (!ibd_async_safe(state))
3818 				return;
3819 
3820 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
3821 			req->rq_gid = event->sm_notice_gid;
3822 			req->rq_ptr = (void *)code;
3823 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
3824 			break;
3825 	}
3826 }
3827 
3828 static void
3829 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
3830 {
3831 	ib_gid_t mgid = req->rq_gid;
3832 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
3833 
3834 	DPRINT(10, "ibd_async_trap : %d\n", code);
3835 
3836 	/*
3837 	 * Atomically search the nonmember and sendonlymember lists and
3838 	 * delete.
3839 	 */
3840 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
3841 
3842 	if (state->id_prom_op == IBD_OP_COMPLETED) {
3843 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
3844 
3845 		/*
3846 		 * If in promiscuous mode, try to join/attach to the new
3847 		 * mcg. Given the unreliable out-of-order mode of trap
3848 		 * delivery, we can never be sure whether it is a problem
3849 		 * if the join fails. Thus, we warn the admin of a failure
3850 		 * if this was a creation trap. Note that the trap might
3851 		 * actually be reporting a long past event, and the mcg
3852 		 * might already have been deleted, thus we might be warning
3853 		 * in vain.
3854 		 */
3855 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
3856 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
3857 			ibd_print_warn(state, "IBA promiscuous mode missed "
3858 			    "new multicast gid %016llx:%016llx",
3859 			    (u_longlong_t)mgid.gid_prefix,
3860 			    (u_longlong_t)mgid.gid_guid);
3861 	}
3862 
3863 	/*
3864 	 * Free the request slot allocated by the subnet event thread.
3865 	 */
3866 	ibd_async_done(state);
3867 }
3868 
3869 /*
3870  * GLDv3 entry point to get capabilities.
3871  */
3872 static boolean_t
3873 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3874 {
3875 	ibd_state_t *state = arg;
3876 
3877 	switch (cap) {
3878 	case MAC_CAPAB_HCKSUM: {
3879 		uint32_t *txflags = cap_data;
3880 
3881 		/*
3882 		 * We either do full checksum offload or none at all
3883 		 */
3884 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
3885 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
3886 		else
3887 			return (B_FALSE);
3888 		break;
3889 	}
3890 
3891 	case MAC_CAPAB_LSO: {
3892 		mac_capab_lso_t *cap_lso = cap_data;
3893 
3894 		/*
3895 		 * In addition to the capability and policy, since LSO
3896 		 * relies on hw checksum, we'll not enable LSO if we
3897 		 * don't have hw checksum.  Of course, if the HCA doesn't
3898 		 * provide the reserved lkey capability, enabling LSO will
3899 		 * actually affect performance adversely, so we'll disable
3900 		 * LSO even for that case.
3901 		 */
3902 		if (!state->id_lso_policy || !state->id_lso_capable)
3903 			return (B_FALSE);
3904 
3905 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
3906 			return (B_FALSE);
3907 
3908 		if (state->id_hca_res_lkey_capab == 0) {
3909 			ibd_print_warn(state, "no reserved-lkey capability, "
3910 			    "disabling LSO");
3911 			return (B_FALSE);
3912 		}
3913 
3914 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3915 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
3916 		break;
3917 	}
3918 
3919 	default:
3920 		return (B_FALSE);
3921 	}
3922 
3923 	return (B_TRUE);
3924 }
3925 
3926 static int
3927 ibd_get_port_details(ibd_state_t *state)
3928 {
3929 	ibt_hca_portinfo_t *port_infop;
3930 	ibt_status_t ret;
3931 	uint_t psize, port_infosz;
3932 
3933 	mutex_enter(&state->id_link_mutex);
3934 
3935 	/*
3936 	 * Query for port information
3937 	 */
3938 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
3939 	    &port_infop, &psize, &port_infosz);
3940 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
3941 		mutex_exit(&state->id_link_mutex);
3942 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
3943 		    "failed, ret=%d", ret);
3944 		return (DDI_FAILURE);
3945 	}
3946 
3947 	/*
3948 	 * If the link already went down by the time we get here,
3949 	 * give up
3950 	 */
3951 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
3952 		mutex_exit(&state->id_link_mutex);
3953 		ibt_free_portinfo(port_infop, port_infosz);
3954 		DPRINT(10, "ibd_get_port_details: port is not active");
3955 		return (DDI_FAILURE);
3956 	}
3957 
3958 	/*
3959 	 * If the link is active, verify the pkey
3960 	 */
3961 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
3962 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
3963 		mutex_exit(&state->id_link_mutex);
3964 		ibt_free_portinfo(port_infop, port_infosz);
3965 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
3966 		    "failed, ret=%d", ret);
3967 		return (DDI_FAILURE);
3968 	}
3969 
3970 	state->id_mtu = (128 << port_infop->p_mtu);
3971 	state->id_sgid = *port_infop->p_sgid_tbl;
3972 	state->id_link_state = LINK_STATE_UP;
3973 
3974 	mutex_exit(&state->id_link_mutex);
3975 	ibt_free_portinfo(port_infop, port_infosz);
3976 
3977 	/*
3978 	 * Now that the port is active, record the port speed
3979 	 */
3980 	state->id_link_speed = ibd_get_portspeed(state);
3981 
3982 	return (DDI_SUCCESS);
3983 }
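
/*
 * Illustration only, compiled out: the IBA port MTU is reported as an
 * enumerated value, and ibd_get_port_details() above converts it to a
 * byte count with the (128 << p_mtu) shift.  For the encodings defined
 * by the spec (1 through 5), that yields:
 *
 *	p_mtu	bytes
 *	  1	  256
 *	  2	  512
 *	  3	 1024
 *	  4	 2048
 *	  5	 4096
 */
#if 0
static uint_t
ib_mtu_enum_to_bytes(uint_t mtu_enum)
{
	return (128U << mtu_enum);
}
#endif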
3984 
3985 static int
3986 ibd_alloc_cqs(ibd_state_t *state)
3987 {
3988 	ibt_hca_attr_t hca_attrs;
3989 	ibt_cq_attr_t cq_attr;
3990 	ibt_status_t ret;
3991 	uint32_t real_size;
3992 
3993 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
3994 	ASSERT(ret == IBT_SUCCESS);
3995 
3996 	/*
3997 	 * Allocate Rx/combined CQ:
3998 	 * Theoretically, there is no point in having more than #rwqe
3999 	 * plus #swqe cqe's, except that the CQ will be signalled for
4000 	 * overflow when the last wqe completes, if none of the previous
4001 	 * cqe's have been polled. Thus, we size the CQ slightly larger
4002 	 * than the number of wqe's to make sure such overflow does not occur.
4003 	 */
4004 	cq_attr.cq_sched = NULL;
4005 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4006 
4007 	if (ibd_separate_cqs == 1) {
4008 		/*
4009 		 * Allocate Receive CQ.
4010 		 */
4011 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4012 			cq_attr.cq_size = state->id_num_rwqe + 1;
4013 		} else {
4014 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4015 			state->id_num_rwqe = cq_attr.cq_size - 1;
4016 		}
4017 
4018 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4019 		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4020 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4021 			    "failed, ret=%d\n", ret);
4022 			return (DDI_FAILURE);
4023 		}
4024 
4025 		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4026 		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4027 			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4028 			    "moderation failed, ret=%d\n", ret);
4029 		}
4030 
4031 		state->id_rxwcs_size = state->id_num_rwqe + 1;
4032 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4033 		    state->id_rxwcs_size, KM_SLEEP);
4034 
4035 		/*
4036 		 * Allocate Send CQ.
4037 		 */
4038 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4039 			cq_attr.cq_size = state->id_num_swqe + 1;
4040 		} else {
4041 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4042 			state->id_num_swqe = cq_attr.cq_size - 1;
4043 		}
4044 
4045 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4046 		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4047 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4048 			    "failed, ret=%d\n", ret);
4049 			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4050 			    state->id_rxwcs_size);
4051 			(void) ibt_free_cq(state->id_rcq_hdl);
4052 			return (DDI_FAILURE);
4053 		}
4054 		if ((ret = ibt_modify_cq(state->id_scq_hdl,
4055 		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
4056 			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4057 			    "moderation failed, ret=%d\n", ret);
4058 		}
4059 
4060 		state->id_txwcs_size = state->id_num_swqe + 1;
4061 		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4062 		    state->id_txwcs_size, KM_SLEEP);
4063 	} else {
4064 		/*
4065 		 * Allocate combined Send/Receive CQ.
4066 		 */
4067 		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
4068 		    state->id_num_swqe + 1)) {
4069 			cq_attr.cq_size = state->id_num_rwqe +
4070 			    state->id_num_swqe + 1;
4071 		} else {
4072 			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4073 			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
4074 			    state->id_num_rwqe) / (state->id_num_rwqe +
4075 			    state->id_num_swqe);
4076 			state->id_num_swqe = cq_attr.cq_size - 1 -
4077 			    state->id_num_rwqe;
4078 		}
4079 
4080 		state->id_rxwcs_size = cq_attr.cq_size;
4081 		state->id_txwcs_size = state->id_rxwcs_size;
4082 
4083 		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4084 		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4085 			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
4086 			    "failed, ret=%d\n", ret);
4087 			return (DDI_FAILURE);
4088 		}
4089 		state->id_scq_hdl = state->id_rcq_hdl;
4090 		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4091 		    state->id_rxwcs_size, KM_SLEEP);
4092 		state->id_txwcs = state->id_rxwcs;
4093 	}
4094 
4095 	/*
4096 	 * Print message in case we could not allocate as many wqe's
4097 	 * as were requested.
4098 	 */
4099 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4100 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4101 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4102 	}
4103 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4104 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4105 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4106 	}
4107 
4108 	return (DDI_SUCCESS);
4109 }
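
/*
 * Illustration only, compiled out: how the combined-CQ path in
 * ibd_alloc_cqs() above splits a limited CQ between receive and send
 * wqes when the HCA cannot provide the full size.  The (cq_size - 1)
 * usable entries are divided in proportion to the originally requested
 * counts, with the send side taking whatever remains after the integer
 * division.  The helper name is hypothetical.
 */
#if 0
static void
split_cq_entries(uint32_t cq_size, uint32_t *num_rwqe, uint32_t *num_swqe)
{
	uint32_t total = *num_rwqe + *num_swqe;
	uint32_t rwqe = ((cq_size - 1) * *num_rwqe) / total;

	*num_swqe = (cq_size - 1) - rwqe;
	*num_rwqe = rwqe;
}
#endif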
4110 
4111 static int
4112 ibd_setup_ud_channel(ibd_state_t *state)
4113 {
4114 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4115 	ibt_ud_chan_query_attr_t ud_chan_attr;
4116 	ibt_status_t ret;
4117 
4118 	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
4119 	if (state->id_hca_res_lkey_capab)
4120 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4121 	if (state->id_lso_policy && state->id_lso_capable)
4122 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4123 
4124 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4125 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4126 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4127 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4128 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4129 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4130 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4131 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4132 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4133 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4134 	ud_alloc_attr.ud_clone_chan	= NULL;
4135 
4136 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4137 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4138 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4139 		    "failed, ret=%d\n", ret);
4140 		return (DDI_FAILURE);
4141 	}
4142 
4143 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4144 	    &ud_chan_attr)) != IBT_SUCCESS) {
4145 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4146 		    "failed, ret=%d\n", ret);
4147 		(void) ibt_free_channel(state->id_chnl_hdl);
4148 		return (DDI_FAILURE);
4149 	}
4150 
4151 	state->id_qpnum = ud_chan_attr.ud_qpn;
4152 
4153 	return (DDI_SUCCESS);
4154 }
4155 
4156 static int
4157 ibd_undo_m_start(ibd_state_t *state)
4158 {
4159 	uint32_t progress = state->id_mac_state;
4160 	uint_t attempts;
4161 	ibt_status_t ret;
4162 	ib_gid_t mgid;
4163 	ibd_mce_t *mce;
4164 	uint8_t jstate;
4165 
4166 	/*
4167 	 * Before we try to stop/undo whatever we did in ibd_m_start(),
4168 	 * we need to mark the link state as unknown to prevent nw
4169 	 * we need to mark the link state as unknown to prevent the
4170 	 * network layer from using this instance for any new transfers.
4171 	if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) {
4172 		state->id_link_state = LINK_STATE_UNKNOWN;
4173 		mac_link_update(state->id_mh, state->id_link_state);
4174 
4175 		state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4176 	}
4177 
4178 	if (progress & IBD_DRV_STARTED) {
4179 		state->id_mac_state &= (~IBD_DRV_STARTED);
4180 	}
4181 
4182 	/*
4183 	 * First, stop receive interrupts; this stops the driver from
4184 	 * handing up buffers to higher layers.  Wait for receive buffers
4185 	 * to be returned and give up after 5 seconds.
4186 	 */
4187 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4188 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4189 		attempts = 50;
4190 		while (state->id_rx_list.dl_bufs_outstanding > 0) {
4191 			delay(drv_usectohz(100000));
4192 			if (--attempts == 0) {
4193 				/*
4194 				 * There are pending bufs with the network
4195 				 * layer and we have no choice but to wait
4196 				 * for them to be returned. Reap all the
4197 				 * Tx/Rx completions that were posted since
4198 				 * we turned off the notification and
4199 				 * return failure.
4200 				 */
4201 				DPRINT(2, "ibd_undo_m_start: "
4202 				    "reclaiming failed");
4203 				ibd_poll_compq(state, state->id_rcq_hdl);
4204 				ibt_set_cq_handler(state->id_rcq_hdl,
4205 				    ibd_rcq_handler, state);
4206 				return (DDI_FAILURE);
4207 			}
4208 		}
4209 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4210 	}
4211 
4212 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4213 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4214 
4215 		mutex_enter(&state->id_trap_lock);
4216 		state->id_trap_stop = B_TRUE;
4217 		while (state->id_trap_inprog > 0)
4218 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4219 		mutex_exit(&state->id_trap_lock);
4220 
4221 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4222 	}
4223 
4224 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4225 		/*
4226 		 * Flushing the channel ensures that all pending WQE's
4227 		 * are marked with flush_error and handed to the CQ. It
4228 		 * does not guarantee the invocation of the CQ handler.
4229 		 * This call is guaranteed to return successfully for
4230 		 * UD QPNs.
4231 		 */
4232 		ret = ibt_flush_channel(state->id_chnl_hdl);
4233 		ASSERT(ret == IBT_SUCCESS);
4234 
4235 		/*
4236 		 * Turn off Tx interrupts and poll. By the time the polling
4237 		 * returns an empty indicator, we are sure we have seen all
4238 		 * pending Tx callbacks. Note that after the call to
4239 		 * ibt_set_cq_handler() returns, the old handler is
4240 		 * guaranteed not to be invoked anymore.
4241 		 */
4242 		if (ibd_separate_cqs == 1)
4243 			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4244 		ibd_poll_compq(state, state->id_scq_hdl);
4245 
4246 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4247 	}
4248 
4249 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4250 		/*
4251 		 * No new async requests will be posted since the device
4252 		 * link state has been marked as unknown; completion handlers
4253 		 * have been turned off, so Tx handler will not cause any
4254 		 * more IBD_ASYNC_REAP requests.
4255 		 *
4256 		 * Queue a request for the async thread to exit, which will
4257 		 * be serviced after any pending ones. This can take a while,
4258 		 * especially if the SM is unreachable, since IBMF will slowly
4259 		 * timeout each SM request issued by the async thread.  Reap
4260 		 * the thread before continuing on, we do not want it to be
4261 		 * lingering in modunloaded code (or we could move the reap
4262 		 * to ibd_detach(), provided we keep track of the current
4263 		 * id_async_thrid somewhere safe).
4264 		 */
4265 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4266 		thread_join(state->id_async_thrid);
4267 
4268 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4269 	}
4270 
4271 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4272 		/*
4273 		 * Drop all residual full/non membership. This includes full
4274 		 * membership to the broadcast group, and any nonmembership
4275 		 * acquired during transmits. We do this after the Tx completion
4276 		 * handlers are done, since those might result in some late
4277 		 * leaves; this also eliminates a potential race with that
4278 		 * path wrt the mc full list insert/delete. Trap handling
4279 		 * has also been suppressed at this point. Thus, no locks
4280 		 * are required while traversing the mc full list.
4281 		 */
4282 		DPRINT(2, "ibd_undo_m_start: clear full cache entries");
4283 		mce = list_head(&state->id_mc_full);
4284 		while (mce != NULL) {
4285 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4286 			jstate = mce->mc_jstate;
4287 			mce = list_next(&state->id_mc_full, mce);
4288 			ibd_leave_group(state, mgid, jstate);
4289 		}
4290 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4291 	}
4292 
4293 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4294 		ibd_fini_rxlist(state);
4295 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4296 	}
4297 
4298 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4299 		ibd_fini_txlist(state);
4300 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4301 	}
4302 
4303 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4304 		(void) ibt_free_channel(state->id_chnl_hdl);
4305 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4306 	}
4307 
4308 	if (progress & IBD_DRV_CQS_ALLOCD) {
4309 		if (ibd_separate_cqs == 1) {
4310 			kmem_free(state->id_txwcs,
4311 			    sizeof (ibt_wc_t) * state->id_txwcs_size);
4312 			(void) ibt_free_cq(state->id_scq_hdl);
4313 		}
4314 
4315 		kmem_free(state->id_rxwcs,
4316 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4317 		(void) ibt_free_cq(state->id_rcq_hdl);
4318 
4319 		state->id_txwcs = NULL;
4320 		state->id_rxwcs = NULL;
4321 		state->id_scq_hdl = NULL;
4322 		state->id_rcq_hdl = NULL;
4323 
4324 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4325 	}
4326 
4327 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4328 		mod_hash_destroy_hash(state->id_ah_active_hash);
4329 		ibd_acache_fini(state);
4330 
4331 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4332 	}
4333 
4334 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4335 		ibt_free_mcg_info(state->id_mcinfo, 1);
4336 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4337 	}
4338 
4339 	return (DDI_SUCCESS);
4340 }
4341 
4342 /*
4343  * GLDv3 entry point to start hardware.
4344  */
4345 /*ARGSUSED*/
4346 static int
4347 ibd_m_start(void *arg)
4348 {
4349 	ibd_state_t *state = arg;
4350 	kthread_t *kht;
4351 	int err;
4352 
4353 	if (state->id_mac_state & IBD_DRV_STARTED)
4354 		return (DDI_SUCCESS);
4355 
4356 	/*
4357 	 * Get port details; if we fail here, very likely the port
4358 	 * state is inactive or the pkey can't be verified
4359 	 */
4360 	if (ibd_get_port_details(state) != DDI_SUCCESS) {
4361 		DPRINT(10, "ibd_m_start: ibd_get_port_details() failed");
4362 		return (EAGAIN);
4363 	}
4364 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4365 
4366 	/*
4367 	 * Find the IPoIB broadcast group
4368 	 */
4369 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4370 		DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed");
4371 		err = ENOENT;
4372 		goto m_start_fail;
4373 	}
4374 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4375 
4376 	/*
4377 	 * Initialize per-interface caches and lists; if we fail here,
4378 	 * it is most likely due to a lack of resources
4379 	 */
4380 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4381 		DPRINT(10, "ibd_m_start: ibd_acache_init() failed");
4382 		err = ENOMEM;
4383 		goto m_start_fail;
4384 	}
4385 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4386 
4387 	/*
4388 	 * Allocate send and receive completion queues
4389 	 */
4390 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4391 		DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed");
4392 		err = ENOMEM;
4393 		goto m_start_fail;
4394 	}
4395 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4396 
4397 	/*
4398 	 * Setup a UD channel
4399 	 */
4400 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4401 		err = ENOMEM;
4402 		DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed");
4403 		goto m_start_fail;
4404 	}
4405 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4406 
4407 	/*
4408 	 * Allocate and initialize the tx buffer list
4409 	 */
4410 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4411 		DPRINT(10, "ibd_m_start: ibd_init_txlist() failed");
4412 		err = ENOMEM;
4413 		goto m_start_fail;
4414 	}
4415 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4416 
4417 	/*
4418 	 * If we have separate cqs, create the send cq handler here
4419 	 */
4420 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
4421 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4422 		if (ibt_enable_cq_notify(state->id_scq_hdl,
4423 		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
4424 			DPRINT(10,
4425 			    "ibd_m_start: ibt_enable_cq_notify(scq) failed");
4426 			err = EINVAL;
4427 			goto m_start_fail;
4428 		}
4429 		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4430 	}
4431 
4432 	/*
4433 	 * Allocate and initialize the rx buffer list
4434 	 */
4435 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4436 		DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed");
4437 		err = ENOMEM;
4438 		goto m_start_fail;
4439 	}
4440 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4441 
4442 	/*
4443 	 * Join IPoIB broadcast group
4444 	 */
4445 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4446 		DPRINT(10, "ibd_m_start: ibd_join_group() failed");
4447 		err = EINVAL;
4448 		goto m_start_fail;
4449 	}
4450 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4451 
4452 	/*
4453 	 * Create the async thread; thread_create never fails.
4454 	 */
4455 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4456 	    TS_RUN, minclsyspri);
4457 	state->id_async_thrid = kht->t_did;
4458 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4459 
4460 	/*
4461 	 * When we did mac_register() in ibd_attach(), we didn't register
4462 	 * the real macaddr and we didn't have the true port mtu. Now that
4463 	 * we're almost ready, set the local mac address and broadcast
4464 	 * addresses and update gldv3 about the real values of these
4465 	 * parameters.
4466 	 */
4467 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4468 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4469 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4470 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4471 
4472 	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
4473 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4474 
4475 	/*
4476 	 * Setup the receive cq handler
4477 	 */
4478 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4479 	if (ibt_enable_cq_notify(state->id_rcq_hdl,
4480 	    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
4481 		DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) failed");
4482 		err = EINVAL;
4483 		goto m_start_fail;
4484 	}
4485 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4486 
4487 	/*
4488 	 * Setup the subnet notices handler after we've initialized the acache/
4489 	 * mcache and started the async thread, both of which are required for
4490 	 * the trap handler to function properly.
4491 	 *
4492 	 * Now that the async thread has been started (and we've already done
4493 	 * a mac_register() during attach so mac_tx_update() can be called
4494 	 * if necessary without any problem), we can enable the trap handler
4495 	 * to queue requests to the async thread.
4496 	 */
4497 	ibt_register_subnet_notices(state->id_ibt_hdl,
4498 	    ibd_snet_notices_handler, state);
4499 	mutex_enter(&state->id_trap_lock);
4500 	state->id_trap_stop = B_FALSE;
4501 	mutex_exit(&state->id_trap_lock);
4502 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4503 
4504 	/*
4505 	 * Indicate link status to GLDv3 and higher layers. By default,
4506 	 * we assume we are in up state (which must have been true at
4507 	 * least at the time the broadcast mcg's were probed); if there
4508 	 * were any up/down transitions till the time we come here, the
4509 	 * async handler will have updated last known state, which we
4510 	 * use to tell GLDv3. The async handler will not send any
4511 	 * notifications to GLDv3 till we reach here in the initialization
4512 	 * sequence.
4513 	 */
4514 	state->id_mac_state |= IBD_DRV_STARTED;
4515 	mac_link_update(state->id_mh, state->id_link_state);
4516 
4517 	return (DDI_SUCCESS);
4518 
4519 m_start_fail:
4520 	/*
4521 	 * If we ran into a problem during ibd_m_start() and ran into
4522  * some other problem while undoing our partial work, we can't
4523 	 * do anything about it.  Ignore any errors we might get from
4524 	 * ibd_undo_m_start() and just return the original error we got.
4525 	 */
4526 	(void) ibd_undo_m_start(state);
4527 	return (err);
4528 }
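
/*
 * Illustration only, compiled out: the setup/teardown pattern shared by
 * ibd_m_start() and ibd_undo_m_start() above.  Each step that completes
 * sets a bit in a progress mask; on failure (or on a later stop), the
 * undo routine reverses only the steps whose bits are set, in reverse
 * order.  All names below (EX_STEP_*, do_step_*, undo_step_*) are
 * hypothetical, not driver symbols.
 */
#if 0
#define	EX_STEP_A	0x01
#define	EX_STEP_B	0x02

static int do_step_a(void);
static int do_step_b(void);
static void undo_step_a(void);
static void undo_step_b(void);
static void example_undo(uint32_t *);

static int
example_start(uint32_t *progress)
{
	if (do_step_a() != 0)
		goto fail;
	*progress |= EX_STEP_A;

	if (do_step_b() != 0)
		goto fail;
	*progress |= EX_STEP_B;

	return (0);
fail:
	example_undo(progress);		/* reverses only what completed */
	return (-1);
}

static void
example_undo(uint32_t *progress)
{
	if (*progress & EX_STEP_B) {
		undo_step_b();
		*progress &= ~EX_STEP_B;
	}
	if (*progress & EX_STEP_A) {
		undo_step_a();
		*progress &= ~EX_STEP_A;
	}
}
#endif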
4529 
4530 /*
4531  * GLDv3 entry point to stop hardware from receiving packets.
4532  */
4533 /*ARGSUSED*/
4534 static void
4535 ibd_m_stop(void *arg)
4536 {
4537 	ibd_state_t *state = arg;
4538 
4539 	/*
4540 	 * Since ibd_m_stop() doesn't return a value, we cannot
4541 	 * fail even if we run into some problem with ibd_undo_m_start().
4542 	 * The best we can do is to leave it in a good state, so
4543 	 * perhaps a future unplumb will succeed.
4544 	 */
4545 	(void) ibd_undo_m_start(state);
4546 }
4547 
4548 /*
4549  * GLDv3 entry point to modify device's mac address. We do not
4550  * allow address modifications.
4551  */
4552 static int
4553 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4554 {
4555 	ibd_state_t *state = arg;
4556 
4557 	/*
4558 	 * Don't bother even comparing the macaddr if we haven't
4559 	 * completed ibd_m_start().
4560 	 */
4561 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4562 		return (0);
4563 
4564 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4565 		return (0);
4566 	else
4567 		return (EINVAL);
4568 }
4569 
4570 /*
4571  * The blocking part of the IBA join/leave operations is done out
4572  * of here on the async thread.
4573  */
4574 static void
4575 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4576 {
4577 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4578 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4579 
4580 	if (op == IBD_ASYNC_JOIN) {
4581 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4582 			ibd_print_warn(state, "Join multicast group failed :"
4583 			    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4584 		}
4585 	} else {
4586 		/*
4587 		 * Here, we must search for the proper mcg_info and
4588 		 * use that to leave the group.
4589 		 */
4590 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4591 	}
4592 }
4593 
4594 /*
4595  * GLDv3 entry point for multicast enable/disable requests.
4596  * This function queues the operation to the async thread and
4597  * returns success for a valid multicast address.
4598  */
4599 static int
4600 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4601 {
4602 	ibd_state_t *state = (ibd_state_t *)arg;
4603 	ipoib_mac_t maddr, *mcast;
4604 	ib_gid_t mgid;
4605 	ibd_req_t *req;
4606 
4607 	/*
4608 	 * If we haven't completed ibd_m_start(), the async thread wouldn't
4609 	 * have been started and id_bcaddr wouldn't be set, so there's
4610 	 * no point in continuing.
4611 	 */
4612 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4613 		return (0);
4614 
4615 	/*
4616 	 * The incoming multicast address might not be aligned properly
4617 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4618 	 * it to look like one though, to get the offsets of the mc gid,
4619 	 * since we know we are not going to dereference any values with
4620 	 * the ipoib_mac_t pointer.
4621 	 */
4622 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4623 	mcast = &maddr;
4624 
4625 	/*
4626 	 * Check validity of MCG address. We could additionally check
4627 	 * that an enable/disable is not being issued on the "broadcast"
4628 	 * mcg, but since this operation is only invokable by privileged
4629 	 * programs anyway, we allow the flexibility to those dlpi apps.
4630 	 * Note that we do not validate the "scope" of the IBA mcg.
4631 	 */
4632 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4633 		return (EINVAL);
4634 
4635 	/*
4636 	 * fill in multicast pkey and scope
4637 	 */
4638 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4639 
4640 	/*
4641 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4642 	 * nothing (i.e. we stay JOINed to the broadcast group done in
4643 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
4644 	 * requires to be joined to broadcast groups at all times.
4645 	 * requires us to be joined to broadcast groups at all times.
4646 	 * depends on this.
4647 	 */
4648 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4649 		return (0);
4650 
4651 	ibd_n2h_gid(mcast, &mgid);
4652 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4653 	if (req == NULL)
4654 		return (ENOMEM);
4655 
4656 	req->rq_gid = mgid;
4657 
4658 	if (add) {
4659 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4660 		    mgid.gid_prefix, mgid.gid_guid);
4661 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
4662 	} else {
4663 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4664 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4665 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
4666 	}
4667 	return (0);
4668 }
4669 
4670 /*
4671  * The blocking part of the IBA promiscuous operations is done
4672  * out of here on the async thread. The dlpireq parameter indicates
4673  * whether this invocation is due to a dlpi request or due to
4674  * a port up/down event.
4675  */
4676 static void
4677 ibd_async_unsetprom(ibd_state_t *state)
4678 {
4679 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4680 	ib_gid_t mgid;
4681 
4682 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4683 
4684 	while (mce != NULL) {
4685 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4686 		mce = list_next(&state->id_mc_non, mce);
4687 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4688 	}
4689 	state->id_prom_op = IBD_OP_NOTSTARTED;
4690 }
4691 
4692 /*
4693  * The blocking part of the IBA promiscuous operations is done
4694  * out of here on the async thread. The dlpireq parameter indicates
4695  * whether this invocation is due to a dlpi request or due to
4696  * a port up/down event.
4697  */
4698 static void
4699 ibd_async_setprom(ibd_state_t *state)
4700 {
4701 	ibt_mcg_attr_t mcg_attr;
4702 	ibt_mcg_info_t *mcg_info;
4703 	ib_gid_t mgid;
4704 	uint_t numg;
4705 	int i;
4706 	char ret = IBD_OP_COMPLETED;
4707 
4708 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4709 
4710 	/*
4711 	 * Obtain all active MC groups on the IB fabric with
4712 	 * specified criteria (scope + Pkey + Qkey + mtu).
4713 	 */
4714 	bzero(&mcg_attr, sizeof (mcg_attr));
4715 	mcg_attr.mc_pkey = state->id_pkey;
4716 	mcg_attr.mc_scope = state->id_scope;
4717 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4718 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4719 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4720 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4721 	    IBT_SUCCESS) {
4722 		ibd_print_warn(state, "Could not get list of IBA multicast "
4723 		    "groups");
4724 		ret = IBD_OP_ERRORED;
4725 		goto done;
4726 	}
4727 
4728 	/*
4729 	 * Iterate over the returned mcg's and join as NonMember
4730 	 * to the IP mcg's.
4731 	 */
4732 	for (i = 0; i < numg; i++) {
4733 		/*
4734 		 * Do a NonMember JOIN on the MC group.
4735 		 */
4736 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4737 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4738 			ibd_print_warn(state, "IBA promiscuous mode missed "
4739 			    "multicast gid %016llx:%016llx",
4740 			    (u_longlong_t)mgid.gid_prefix,
4741 			    (u_longlong_t)mgid.gid_guid);
4742 	}
4743 
4744 	ibt_free_mcg_info(mcg_info, numg);
4745 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4746 done:
4747 	state->id_prom_op = ret;
4748 }
4749 
4750 /*
4751  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4752  * GLDv3 assumes phys state receives more packets than multi state,
4753  * which is not true for IPoIB. Thus, treat the multi and phys
4754  * promiscuous states the same way to work with GLDv3's assumption.
4755  */
4756 static int
4757 ibd_m_promisc(void *arg, boolean_t on)
4758 {
4759 	ibd_state_t *state = (ibd_state_t *)arg;
4760 	ibd_req_t *req;
4761 
4762 	/*
4763 	 * Async thread wouldn't have been started if we haven't
4764 	 * passed ibd_m_start()
4765 	 */
4766 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4767 		return (0);
4768 
4769 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4770 	if (req == NULL)
4771 		return (ENOMEM);
4772 	if (on) {
4773 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4774 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
4775 	} else {
4776 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4777 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
4778 	}
4779 
4780 	return (0);
4781 }
4782 
4783 /*
4784  * GLDv3 entry point for gathering statistics.
4785  */
4786 static int
4787 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
4788 {
4789 	ibd_state_t *state = (ibd_state_t *)arg;
4790 
4791 	switch (stat) {
4792 	case MAC_STAT_IFSPEED:
4793 		*val = state->id_link_speed;
4794 		break;
4795 	case MAC_STAT_MULTIRCV:
4796 		*val = state->id_multi_rcv;
4797 		break;
4798 	case MAC_STAT_BRDCSTRCV:
4799 		*val = state->id_brd_rcv;
4800 		break;
4801 	case MAC_STAT_MULTIXMT:
4802 		*val = state->id_multi_xmt;
4803 		break;
4804 	case MAC_STAT_BRDCSTXMT:
4805 		*val = state->id_brd_xmt;
4806 		break;
4807 	case MAC_STAT_RBYTES:
4808 		*val = state->id_rcv_bytes;
4809 		break;
4810 	case MAC_STAT_IPACKETS:
4811 		*val = state->id_rcv_pkt;
4812 		break;
4813 	case MAC_STAT_OBYTES:
4814 		*val = state->id_xmt_bytes;
4815 		break;
4816 	case MAC_STAT_OPACKETS:
4817 		*val = state->id_xmt_pkt;
4818 		break;
4819 	case MAC_STAT_OERRORS:
4820 		*val = state->id_ah_error;	/* failed AH translation */
4821 		break;
4822 	case MAC_STAT_IERRORS:
4823 		*val = 0;
4824 		break;
4825 	case MAC_STAT_NOXMTBUF:
4826 		*val = state->id_tx_short;
4827 		break;
4828 	case MAC_STAT_NORCVBUF:
4829 	default:
4830 		return (ENOTSUP);
4831 	}
4832 
4833 	return (0);
4834 }
4835 
4836 static void
4837 ibd_async_txsched(ibd_state_t *state)
4838 {
4839 	ibd_req_t *req;
4840 	int ret;
4841 
4842 	if (ibd_txcomp_poll)
4843 		ibd_poll_compq(state, state->id_scq_hdl);
4844 
4845 	ret = ibd_resume_transmission(state);
4846 	if (ret && ibd_txcomp_poll) {
4847 		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
4848 			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
4849 		else {
4850 			ibd_print_warn(state, "ibd_async_txsched: "
4851 			    "no memory, can't schedule work slot");
4852 		}
4853 	}
4854 }
4855 
4856 static int
4857 ibd_resume_transmission(ibd_state_t *state)
4858 {
4859 	int flag;
4860 	int met_thresh = 0;
4861 	int ret = -1;
4862 
4863 	mutex_enter(&state->id_sched_lock);
4864 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
4865 		met_thresh = (state->id_tx_list.dl_cnt >
4866 		    IBD_FREE_SWQES_THRESH);
4867 		flag = IBD_RSRC_SWQE;
4868 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
4869 		ASSERT(state->id_lso != NULL);
4870 		met_thresh = (state->id_lso->bkt_nfree >
4871 		    IBD_FREE_LSOS_THRESH);
4872 		flag = IBD_RSRC_LSOBUF;
4873 	}
4874 	if (met_thresh) {
4875 		state->id_sched_needed &= ~flag;
4876 		ret = 0;
4877 	}
4878 	mutex_exit(&state->id_sched_lock);
4879 
4880 	if (ret == 0)
4881 		mac_tx_update(state->id_mh);
4882 
4883 	return (ret);
4884 }
4885 
4886 /*
4887  * Release the send wqe back into free list.
4888  */
4889 static void
4890 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
4891 {
4892 	/*
4893 	 * Add back on Tx list for reuse.
4894 	 */
4895 	swqe->swqe_next = NULL;
4896 	mutex_enter(&state->id_tx_list.dl_mutex);
4897 	if (state->id_tx_list.dl_pending_sends) {
4898 		state->id_tx_list.dl_pending_sends = B_FALSE;
4899 	}
4900 	if (state->id_tx_list.dl_head == NULL) {
4901 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
4902 	} else {
4903 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
4904 	}
4905 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
4906 	state->id_tx_list.dl_cnt++;
4907 	mutex_exit(&state->id_tx_list.dl_mutex);
4908 }
4909 
4910 /*
4911  * Acquire a send wqe from free list.
4912  * Returns error number and send wqe pointer.
4913  */
4914 static int
4915 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
4916 {
4917 	int rc = 0;
4918 	ibd_swqe_t *wqe;
4919 
4920 	/*
4921 	 * Check and reclaim some of the completed Tx requests.
4922 	 * If someone else is already in this code and pulling Tx
4923 	 * completions, no need to poll, since the current lock holder
4924 	 * will do the work anyway. Normally, we poll for completions
4925 	 * every few Tx attempts, but if we are short on Tx descriptors,
4926 	 * we always try to poll.
4927 	 */
4928 	if ((ibd_txcomp_poll == 1) &&
4929 	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
4930 		ibd_poll_compq(state, state->id_scq_hdl);
4931 	}
4932 
4933 	/*
4934 	 * Grab required transmit wqes.
4935 	 */
4936 	mutex_enter(&state->id_tx_list.dl_mutex);
4937 	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
4938 	if (wqe != NULL) {
4939 		state->id_tx_list.dl_cnt -= 1;
4940 		state->id_tx_list.dl_head = wqe->swqe_next;
4941 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
4942 			state->id_tx_list.dl_tail = NULL;
4943 	} else {
4944 		/*
4945 		 * We couldn't find a free swqe; flag the shortage and
4946 		 * count it against the no-Tx-buf statistic.
4947 		 */
4948 		rc = ENOENT;
4949 		state->id_tx_list.dl_pending_sends = B_TRUE;
4950 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
4951 		atomic_add_64(&state->id_tx_short, 1);
4952 	}
4953 	mutex_exit(&state->id_tx_list.dl_mutex);
4954 	*swqe = wqe;
4955 
4956 	return (rc);
4957 }
4958 
4959 static int
4960 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
4961     ibt_ud_dest_hdl_t ud_dest)
4962 {
4963 	mblk_t	*nmp;
4964 	int iph_len, tcph_len;
4965 	ibt_wr_lso_t *lso;
4966 	uintptr_t ip_start, tcp_start;
4967 	uint8_t *dst;
4968 	uint_t pending, mblen;
4969 
4970 	/*
4971 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
4972 	 * we need to adjust it here for lso.
4973 	 */
4974 	lso = &(node->w_swr.wr.ud_lso);
4975 	lso->lso_ud_dest = ud_dest;
4976 	lso->lso_mss = mss;
4977 
4978 	/*
4979 	 * Calculate the LSO header size and set it in the UD LSO structure.
4980 	 * Note that the only assumption we make is that each of the IPoIB,
4981 	 * IP and TCP headers will be contained in a single mblk fragment;
4982 	 * together, the headers may span multiple mblk fragments.
4983 	 */
4984 	nmp = mp;
4985 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
4986 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
4987 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
4988 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
4989 		nmp = nmp->b_cont;
4990 
4991 	}
4992 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
4993 
4994 	tcp_start = ip_start + iph_len;
4995 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
4996 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
4997 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
4998 		nmp = nmp->b_cont;
4999 	}
5000 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5001 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5002 
5003 	/*
5004 	 * If the lso header fits entirely within a single mblk fragment,
5005 	 * we'll avoid an additional copy of the lso header here and just
5006 	 * pass the b_rptr of the mblk directly.
5007 	 *
5008 	 * If this isn't true, we'd have to allocate for it explicitly.
5009 	 */
5010 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5011 		lso->lso_hdr = mp->b_rptr;
5012 	} else {
5013 		/* On work completion, remember to free this allocated hdr */
5014 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5015 		if (lso->lso_hdr == NULL) {
5016 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5017 			    "sz = %d", lso->lso_hdr_sz);
5018 			lso->lso_hdr_sz = 0;
5019 			lso->lso_mss = 0;
5020 			return (-1);
5021 		}
5022 	}
5023 
5024 	/*
5025 	 * Copy in the lso header only if we need to
5026 	 */
5027 	if (lso->lso_hdr != mp->b_rptr) {
5028 		dst = lso->lso_hdr;
5029 		pending = lso->lso_hdr_sz;
5030 
5031 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5032 			mblen = MBLKL(nmp);
5033 			if (pending > mblen) {
5034 				bcopy(nmp->b_rptr, dst, mblen);
5035 				dst += mblen;
5036 				pending -= mblen;
5037 			} else {
5038 				bcopy(nmp->b_rptr, dst, pending);
5039 				break;
5040 			}
5041 		}
5042 	}
5043 
5044 	return (0);
5045 }
5046 
5047 static void
5048 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5049 {
5050 	ibt_wr_lso_t *lso;
5051 
5052 	if ((!node) || (!mp))
5053 		return;
5054 
5055 	/*
5056 	 * Free any header space that we might've allocated if we
5057 	 * did an LSO
5058 	 */
5059 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5060 		lso = &(node->w_swr.wr.ud_lso);
5061 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5062 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5063 			lso->lso_hdr = NULL;
5064 			lso->lso_hdr_sz = 0;
5065 		}
5066 	}
5067 }
5068 
5069 static void
5070 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5071 {
5072 	uint_t		i;
5073 	uint_t		num_posted;
5074 	uint_t		n_wrs;
5075 	ibt_status_t	ibt_status;
5076 	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
5077 	ibd_swqe_t	*elem;
5078 	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];
5079 
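	/*
	 * The new wqe is always appended at the tail of the chain, so
	 * terminate its next pointer before enqueueing it.
	 */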
5080 	node->swqe_next = NULL;
5081 
5082 	mutex_enter(&state->id_txpost_lock);
5083 
5084 	/*
5085 	 * Enqueue the new node in chain of wqes to send
5086 	 */
5087 	if (state->id_tx_head) {
5088 		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
5089 	} else {
5090 		state->id_tx_head = node;
5091 	}
5092 	state->id_tx_tailp = &(node->swqe_next);
5093 
5094 	/*
5095 	 * If someone else is helping out with the sends,
5096 	 * just go back
5097 	 */
5098 	if (state->id_tx_busy) {
5099 		mutex_exit(&state->id_txpost_lock);
5100 		return;
5101 	}
5102 
5103 	/*
5104 	 * Otherwise, set the flag to indicate that we'll be doing
5105 	 * the dispatch of whatever is on the wqe chain.
5106 	 */
5107 	state->id_tx_busy = 1;
5108 
5109 	while (state->id_tx_head) {
5110 		/*
5111 		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
5112 		 * at a time if possible, and keep posting them.
5113 		 */
5114 		for (n_wrs = 0, elem = state->id_tx_head;
5115 		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
5116 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5117 
5118 			nodes[n_wrs] = elem;
5119 			wrs[n_wrs] = elem->w_swr;
5120 		}
5121 		state->id_tx_head = elem;
5122 
5123 		/*
5124 		 * Release the txpost lock before posting the send
5125 		 * request to the hca.
5128 		 */
5129 		mutex_exit(&state->id_txpost_lock);
5130 
5131 		ASSERT(n_wrs != 0);
5132 
5133 		/*
5134 		 * If posting fails for some reason, we'll never receive
5135 		 * a completion notification, so we'll need to clean up. But
5136 		 * we need to make sure we don't clean up nodes whose
5137 		 * wrs have been successfully posted. We assume that the
5138 		 * hca driver returns on the first failure to post and
5139 		 * therefore the first 'num_posted' entries don't need
5140 		 * cleanup here.
5141 		 */
5142 		num_posted = 0;
5143 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5144 		    wrs, n_wrs, &num_posted);
5145 		if (ibt_status != IBT_SUCCESS) {
5146 
5147 			ibd_print_warn(state, "ibd_post_send: "
5148 			    "posting multiple wrs failed: "
5149 			    "requested=%d, done=%d, ret=%d",
5150 			    n_wrs, num_posted, ibt_status);
5151 
5152 			for (i = num_posted; i < n_wrs; i++)
5153 				ibd_tx_cleanup(state, nodes[i]);
5154 		}
5155 
5156 		/*
5157 		 * Grab the mutex before we go and check the tx Q again
5158 		 */
5159 		mutex_enter(&state->id_txpost_lock);
5160 	}
5161 
5162 	state->id_tx_busy = 0;
5163 	mutex_exit(&state->id_txpost_lock);
5164 }
5165 
5166 static int
5167 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5168     uint_t lsohdr_sz)
5169 {
5170 	ibt_wr_ds_t *sgl;
5171 	ibt_status_t ibt_status;
5172 	mblk_t *nmp;
5173 	mblk_t *data_mp;
5174 	uchar_t *bufp;
5175 	size_t blksize;
5176 	size_t skip;
5177 	size_t avail;
5178 	uint_t pktsize;
5179 	uint_t frag_len;
5180 	uint_t pending_hdr;
5181 	uint_t hiwm;
5182 	int nmblks;
5183 	int i;
5184 
5185 	/*
5186 	 * Let's skip ahead to the data if this is LSO
5187 	 */
5188 	data_mp = mp;
5189 	pending_hdr = 0;
5190 	if (lsohdr_sz) {
5191 		pending_hdr = lsohdr_sz;
5192 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5193 			frag_len = nmp->b_wptr - nmp->b_rptr;
5194 			if (frag_len > pending_hdr)
5195 				break;
5196 			pending_hdr -= frag_len;
5197 		}
5198 		data_mp = nmp;	/* start of data past lso header */
5199 		ASSERT(data_mp != NULL);
5200 	}
5201 
5202 	/*
5203 	 * Calculate the size of message data and number of msg blocks
5204 	 */
5205 	pktsize = 0;
5206 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5207 	    nmp = nmp->b_cont, nmblks++) {
5208 		pktsize += MBLKL(nmp);
5209 	}
5210 	pktsize -= pending_hdr;
5211 
5212 	/*
5213 	 * Translating the virtual address regions into physical regions
5214 	 * for using the Reserved LKey feature results in a wr sgl that
5215 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
5216 	 * we'll fix a high-water mark (65%) for when we should stop.
5217 	 */
5218 	hiwm = (state->id_max_sqseg * 65) / 100;
5219 
5220 	/*
5221 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5222 	 * "copy-threshold", and if the number of mp fragments is less than
5223 	 * the maximum acceptable.
5224 	 */
5225 	if ((state->id_hca_res_lkey_capab) &&
5226 	    (pktsize > IBD_TX_COPY_THRESH) &&
5227 	    (nmblks < hiwm)) {
5228 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5229 		ibt_iov_attr_t iov_attr;
5230 
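		/*
		 * Set up the iov attributes and one iov entry per mblk
		 * fragment (skipping any lso header bytes in the first
		 * fragment) for the ibt_map_mem_iov() call below.
		 */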
5231 		iov_attr.iov_as = NULL;
5232 		iov_attr.iov = iov_arr;
5233 		iov_attr.iov_buf = NULL;
5234 		iov_attr.iov_list_len = nmblks;
5235 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5236 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5237 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5238 
5239 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5240 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5241 			iov_arr[i].iov_len = MBLKL(nmp);
5242 			if (i == 0) {
5243 				iov_arr[i].iov_addr += pending_hdr;
5244 				iov_arr[i].iov_len -= pending_hdr;
5245 			}
5246 		}
5247 
5248 		node->w_buftype = IBD_WQE_MAPPED;
5249 		node->w_swr.wr_sgl = node->w_sgl;
5250 
5251 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5252 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5253 		if (ibt_status != IBT_SUCCESS) {
5254 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5255 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5256 			goto ibd_copy_path;
5257 		}
5258 
5259 		return (0);
5260 	}
5261 
5262 ibd_copy_path:
5263 	if (pktsize <= state->id_tx_buf_sz) {
5264 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5265 		node->w_swr.wr_nds = 1;
5266 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5267 		node->w_buftype = IBD_WQE_TXBUF;
5268 
5269 		/*
5270 		 * Even though this is the copy path for transfers less than
5271 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5272 		 * is possible the first data mblk fragment (data_mp) still
5273 		 * contains part of the LSO header that we need to skip.
5274 		 */
5275 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5276 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5277 			blksize = MBLKL(nmp) - pending_hdr;
5278 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5279 			bufp += blksize;
5280 			pending_hdr = 0;
5281 		}
5282 
5283 		return (0);
5284 	}
5285 
5286 	/*
5287 	 * Copy path for transfers greater than id_tx_buf_sz
5288 	 */
5289 	node->w_swr.wr_sgl = node->w_sgl;
5290 	if (ibd_acquire_lsobufs(state, pktsize,
5291 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5292 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5293 		return (-1);
5294 	}
5295 	node->w_buftype = IBD_WQE_LSOBUF;
5296 
5297 	/*
5298 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5299 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5300 	 * need to skip part of the LSO header in the first fragment
5301 	 * as before.
5302 	 */
5303 	nmp = data_mp;
5304 	skip = pending_hdr;
5305 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5306 		sgl = node->w_swr.wr_sgl + i;
5307 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5308 		avail = IBD_LSO_BUFSZ;
5309 		while (nmp && avail) {
5310 			blksize = MBLKL(nmp) - skip;
5311 			if (blksize > avail) {
5312 				bcopy(nmp->b_rptr + skip, bufp, avail);
5313 				skip += avail;
5314 				avail = 0;
5315 			} else {
5316 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5317 				skip = 0;
5318 				avail -= blksize;
5319 				bufp += blksize;
5320 				nmp = nmp->b_cont;
5321 			}
5322 		}
5323 	}
5324 
5325 	return (0);
5326 }
5327 
5328 /*
5329  * Schedule a completion queue polling to reap the resource we're
5330  * short on.  If we implement the change to reap tx completions
5331  * in a separate thread, we'll need to wake up that thread here.
5332  */
5333 static int
5334 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5335 {
5336 	ibd_req_t *req;
5337 
5338 	mutex_enter(&state->id_sched_lock);
5339 	state->id_sched_needed |= resource_type;
5340 	mutex_exit(&state->id_sched_lock);
5341 
5342 	/*
5343 	 * If we are asked to queue a work entry, we need to do it
5344 	 */
5345 	if (q_flag) {
5346 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5347 		if (req == NULL)
5348 			return (-1);
5349 
5350 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5351 	}
5352 
5353 	return (0);
5354 }
5355 
5356 /*
5357  * The passed-in packet has this format:
5358  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5359  */
5360 static boolean_t
5361 ibd_send(ibd_state_t *state, mblk_t *mp)
5362 {
5363 	ibd_ace_t *ace;
5364 	ibd_swqe_t *node;
5365 	ipoib_mac_t *dest;
5366 	ib_header_info_t *ipibp;
5367 	ip6_t *ip6h;
5368 	uint_t pktsize;
5369 	uint32_t mss;
5370 	uint32_t hckflags;
5371 	uint32_t lsoflags = 0;
5372 	uint_t lsohdr_sz = 0;
5373 	int ret, len;
5374 	boolean_t dofree = B_FALSE;
5375 	boolean_t rc;
5376 
5377 	/*
5378 	 * If we aren't done with the device initialization and start,
5379 	 * we shouldn't be here.
5380 	 */
5381 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5382 		return (B_FALSE);
5383 
5384 	node = NULL;
5385 	if (ibd_acquire_swqe(state, &node) != 0) {
5386 		/*
5387 		 * If we don't have an swqe available, schedule a transmit
5388 		 * completion queue cleanup and hold off on sending more
5389 		 * packets until we have some free swqes.
5390 		 */
5391 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5392 			return (B_FALSE);
5393 
5394 		/*
5395 		 * If a poll cannot be scheduled, we have no choice but
5396 		 * to drop this packet
5397 		 */
5398 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5399 		return (B_TRUE);
5400 	}
5401 
5402 	/*
5403 	 * Initialize the commonly used fields in swqe to NULL to protect
5404 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5405 	 * failure.
5406 	 */
5407 	node->swqe_im_mblk = NULL;
5408 	node->w_swr.wr_nds = 0;
5409 	node->w_swr.wr_sgl = NULL;
5410 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5411 
5412 	/*
5413 	 * Obtain an address handle for the destination.
5414 	 */
5415 	ipibp = (ib_header_info_t *)mp->b_rptr;
5416 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5417 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5418 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5419 
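	/*
	 * Update the transmit byte and packet counters, tracking
	 * broadcast and multicast transmits separately.
	 */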
5420 	pktsize = msgsize(mp);
5421 
5422 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5423 	atomic_inc_64(&state->id_xmt_pkt);
5424 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5425 		atomic_inc_64(&state->id_brd_xmt);
5426 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5427 		atomic_inc_64(&state->id_multi_xmt);
5428 
5429 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5430 		node->w_ahandle = ace;
5431 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5432 	} else {
5433 		DPRINT(5,
5434 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5435 		    ((ret == EFAULT) ? "failed" : "queued"),
5436 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5437 		    htonl(dest->ipoib_gidpref[1]),
5438 		    htonl(dest->ipoib_gidsuff[0]),
5439 		    htonl(dest->ipoib_gidsuff[1]));
5440 		node->w_ahandle = NULL;
5441 
5442 		/*
5443 		 * In poll mode, there are probably cqes pending in the cq;
5444 		 * poll the cq here, otherwise the acache entries may never
5445 		 * get recycled.
5446 		 */
5447 		if (ibd_txcomp_poll == 1)
5448 			ibd_poll_compq(state, state->id_scq_hdl);
5449 
5450 		/*
5451 		 * If ibd_acache_lookup() returns EFAULT, ibd cannot find a
5452 		 * path to the destination address, so the packet must be
5453 		 * dropped.  The packet is also dropped if we cannot schedule
5454 		 * a poll via the async thread.  In the normal case, ibd
5455 		 * returns the packet to the upper layer and waits for the
5456 		 * AH to be created.
5457 		 *
5458 		 * Note that we always queue a work slot entry for the async
5459 		 * thread when we fail AH lookup (even in intr mode); this is
5460 		 * due to the convoluted way the code currently looks for AH.
5461 		 */
5462 		if (ret == EFAULT) {
5463 			dofree = B_TRUE;
5464 			rc = B_TRUE;
5465 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5466 			dofree = B_TRUE;
5467 			rc = B_TRUE;
5468 		} else {
5469 			dofree = B_FALSE;
5470 			rc = B_FALSE;
5471 		}
5472 		goto ibd_send_fail;
5473 	}
5474 
5475 	/*
5476 	 * For ND6 packets, padding is at the front of the source lladdr.
5477 	 * Insert the padding at front.
5478 	 */
5479 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
5480 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5481 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5482 			    sizeof (ib_header_info_t))) {
5483 				DPRINT(10, "ibd_send: pullupmsg failure ");
5484 				dofree = B_TRUE;
5485 				rc = B_TRUE;
5486 				goto ibd_send_fail;
5487 			}
5488 			ipibp = (ib_header_info_t *)mp->b_rptr;
5489 		}
5490 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5491 		    sizeof (ib_header_info_t));
5492 		len = ntohs(ip6h->ip6_plen);
5493 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5494 			mblk_t	*pad;
5495 
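			/*
			 * Grow the message by four bytes so that
			 * IBD_PAD_NSNA() has room to insert the lladdr
			 * padding.
			 */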
5496 			pad = allocb(4, 0);
			if (pad == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
5497 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5498 			linkb(mp, pad);
5499 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5500 			    IPV6_HDR_LEN + len + 4) {
5501 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5502 				    IPV6_HDR_LEN + len + 4)) {
5503 					DPRINT(10, "ibd_send: pullupmsg "
5504 					    "failure ");
5505 					dofree = B_TRUE;
5506 					rc = B_TRUE;
5507 					goto ibd_send_fail;
5508 				}
5509 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5510 				    sizeof (ib_header_info_t));
5511 			}
5512 
5513 			/* LINTED: E_CONSTANT_CONDITION */
5514 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5515 		}
5516 	}
5517 
5518 	mp->b_rptr += sizeof (ib_addrs_t);
5519 
5520 	/*
5521 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5522 	 * ud destination, the opcode and the LSO header information to the
5523 	 * work request.
5524 	 */
5525 	lso_info_get(mp, &mss, &lsoflags);
5526 	if ((lsoflags & HW_LSO) != HW_LSO) {
5527 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5528 		lsohdr_sz = 0;
5529 	} else {
5530 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5531 			/*
5532 			 * The routine can only fail if there's no memory; if
5533 			 * that happens, all we can do is drop the packet.
5534 			 */
5535 			ibd_print_warn(state,
5536 			    "ibd_send: no memory, lso posting failed");
5537 			dofree = B_TRUE;
5538 			rc = B_TRUE;
5539 			goto ibd_send_fail;
5540 		}
5541 
5542 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5543 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5544 	}
5545 
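	/*
	 * If the upper layer requested checksum offload for this packet,
	 * ask the hca to compute the checksum in hardware.
	 */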
5546 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5547 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5548 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5549 	else
5550 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5551 
5552 	/*
5553 	 * Prepare the sgl for posting; the routine can only fail if there's
5554 	 * no lso buf available for posting. If that's the case, we schedule
5555 	 * to wait for lso bufs to become available and retry the packet later.
5556 	 */
5557 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5558 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5559 			dofree = B_TRUE;
5560 			rc = B_TRUE;
5561 		} else {
5562 			dofree = B_FALSE;
5563 			rc = B_FALSE;
5564 		}
5565 		goto ibd_send_fail;
5566 	}
5567 	node->swqe_im_mblk = mp;
5568 
5569 	/*
5570 	 * Queue the wqe to hardware; since we can now simply queue a
5571 	 * post instead of doing it serially, we cannot assume anything
5572 	 * about the 'node' after ibd_post_send() returns.
5573 	 */
5574 	ibd_post_send(state, node);
5575 
5576 	return (B_TRUE);
5577 
5578 ibd_send_fail:
5579 	if (node && mp)
5580 		ibd_free_lsohdr(node, mp);
5581 
5582 	if (dofree)
5583 		freemsg(mp);
5584 
5585 	if (node != NULL)
5586 		ibd_tx_cleanup(state, node);
5587 
5588 	return (rc);
5589 }
5590 
5591 /*
5592  * GLDv3 entry point for transmitting datagrams.
5593  */
5594 static mblk_t *
5595 ibd_m_tx(void *arg, mblk_t *mp)
5596 {
5597 	ibd_state_t *state = (ibd_state_t *)arg;
5598 	mblk_t *next;
5599 
5600 	while (mp != NULL) {
5601 		next = mp->b_next;
5602 		mp->b_next = NULL;
5603 		if (ibd_send(state, mp) == B_FALSE) {
5604 			/* Send fail */
5605 			mp->b_next = next;
5606 			break;
5607 		}
5608 		mp = next;
5609 	}
5610 
5611 	return (mp);
5612 }
5613 
5614 /*
5615  * This handles Tx and Rx completions. With separate CQs, this handles
5616  * only Rx completions.
5617  */
5618 static uint_t
5619 ibd_intr(char *arg)
5620 {
5621 	ibd_state_t *state = (ibd_state_t *)arg;
5622 
5623 	ibd_poll_compq(state, state->id_rcq_hdl);
5624 
5625 	return (DDI_INTR_CLAIMED);
5626 }
5627 
5628 /*
5629  * Poll and drain the cq
5630  */
5631 static uint_t
5632 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
5633     uint_t numwcs)
5634 {
5635 	ibd_wqe_t *wqe;
5636 	ibt_wc_t *wc;
5637 	uint_t total_polled = 0;
5638 	uint_t num_polled;
5639 	int i;
5640 
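	/*
	 * Keep polling the cq until it is empty, processing each
	 * reaped work completion along the way.
	 */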
5641 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5642 		total_polled += num_polled;
5643 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5644 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5645 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5646 			    (wqe->w_type == IBD_WQE_RECV));
5647 			if (wc->wc_status != IBT_WC_SUCCESS) {
5648 				/*
5649 				 * Channel being torn down.
5650 				 */
5651 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5652 					DPRINT(5, "ibd_drain_cq: flush error");
5653 					/*
5654 					 * Only invoke the Tx handler to
5655 					 * release possibly held resources
5656 					 * like AH refcount etc. Can not
5657 					 * invoke Rx handler because it might
5658 					 * try adding buffers to the Rx pool
5659 					 * when we are trying to deinitialize.
5660 					 */
5661 					if (wqe->w_type == IBD_WQE_RECV) {
5662 						continue;
5663 					} else {
5664 						DPRINT(10, "ibd_drain_cq: Bad "
5665 						    "status %d", wc->wc_status);
5666 					}
5667 				}
5668 			}
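			/*
			 * Hand the completion to the Tx or Rx path
			 * based on the wqe type.
			 */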
5669 			if (wqe->w_type == IBD_WQE_SEND) {
5670 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
5671 			} else {
5672 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5673 			}
5674 		}
5675 	}
5676 
5677 	return (total_polled);
5678 }
5679 
5680 /*
5681  * Common code for interrupt handling as well as for polling
5682  * for all completed wqe's while detaching.
5683  */
5684 static void
5685 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5686 {
5687 	ibt_wc_t *wcs;
5688 	uint_t numwcs;
5689 	int flag, redo_flag;
5690 	int redo = 1;
5691 	uint_t num_polled = 0;
5692 
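	/*
	 * Pick the busy/redo flags for this cq; with separate Tx and
	 * Rx cqs, the flags depend on which cq we've been asked to
	 * poll.
	 */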
5693 	if (ibd_separate_cqs == 1) {
5694 		if (cq_hdl == state->id_rcq_hdl) {
5695 			flag = IBD_RX_CQ_POLLING;
5696 			redo_flag = IBD_REDO_RX_CQ_POLLING;
5697 		} else {
5698 			flag = IBD_TX_CQ_POLLING;
5699 			redo_flag = IBD_REDO_TX_CQ_POLLING;
5700 		}
5701 	} else {
5702 		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
5703 		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
5704 	}
5705 
5706 	mutex_enter(&state->id_cq_poll_lock);
5707 	if (state->id_cq_poll_busy & flag) {
5708 		state->id_cq_poll_busy |= redo_flag;
5709 		mutex_exit(&state->id_cq_poll_lock);
5710 		return;
5711 	}
5712 	state->id_cq_poll_busy |= flag;
5713 	mutex_exit(&state->id_cq_poll_lock);
5714 
5715 	/*
5716 	 * In some cases (e.g. detaching), this code can be invoked on
5717 	 * any cpu after disabling cq notification (thus no concurrency
5718 	 * exists). Apart from that, the following applies normally:
5719 	 * The receive completion handling is always on the Rx interrupt
5720 	 * cpu. Transmit completion handling could be from any cpu if
5721 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5722 	 * is interrupt driven. Combined completion handling is always
5723 	 * on the interrupt cpu. Thus, lock accordingly and use the
5724 	 * proper completion array.
5725 	 */
5726 	if (ibd_separate_cqs == 1) {
5727 		if (cq_hdl == state->id_rcq_hdl) {
5728 			wcs = state->id_rxwcs;
5729 			numwcs = state->id_rxwcs_size;
5730 		} else {
5731 			wcs = state->id_txwcs;
5732 			numwcs = state->id_txwcs_size;
5733 		}
5734 	} else {
5735 		wcs = state->id_rxwcs;
5736 		numwcs = state->id_rxwcs_size;
5737 	}
5738 
5739 	/*
5740 	 * Poll and drain the CQ
5741 	 */
5742 	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5743 
5744 	/*
5745 	 * Enable CQ notifications and redrain the cq to catch any
5746 	 * completions we might have missed after the ibd_drain_cq()
5747 	 * above and before the ibt_enable_cq_notify() that follows.
5748 	 * Finally, service any new requests to poll the cq that
5749 	 * could've come in after the ibt_enable_cq_notify().
5750 	 */
5751 	do {
5752 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
5753 		    IBT_SUCCESS) {
5754 			DPRINT(10, "ibd_poll_compq: ibt_enable_cq_notify() "
			    "failed");
5755 		}
5756 
5757 		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5758 
5759 		mutex_enter(&state->id_cq_poll_lock);
5760 		if (state->id_cq_poll_busy & redo_flag)
5761 			state->id_cq_poll_busy &= ~redo_flag;
5762 		else {
5763 			state->id_cq_poll_busy &= ~flag;
5764 			redo = 0;
5765 		}
5766 		mutex_exit(&state->id_cq_poll_lock);
5767 
5768 	} while (redo);
5769 
5770 	/*
5771 	 * If we polled the receive cq and found anything, we need to flush
5772 	 * it out to the nw layer here.
5773 	 */
5774 	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
5775 		ibd_flush_rx(state, NULL);
5776 	}
5777 }
5778 
5779 /*
5780  * Unmap the memory area associated with a given swqe.
5781  */
5782 static void
5783 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
5784 {
5785 	ibt_status_t stat;
5786 
5787 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
5788 
5789 	if (swqe->w_mi_hdl) {
5790 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
5791 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
5792 			DPRINT(10,
5793 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
5794 		}
5795 		swqe->w_mi_hdl = NULL;
5796 	}
5797 	swqe->w_swr.wr_nds = 0;
5798 }
5799 
5800 /*
5801  * Common code that deals with clean ups after a successful or
5802  * erroneous transmission attempt.
5803  */
5804 static void
5805 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
5806 {
5807 	ibd_ace_t *ace = swqe->w_ahandle;
5808 
5809 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5810 
5811 	/*
5812 	 * If this was a dynamic mapping in ibd_send(), we need to
5813 	 * unmap here. If this was an lso buffer we'd used for sending,
5814 	 * we need to release the lso buf to the pool, since the resource
5815 	 * is scarce. However, if this was simply a normal send using
5816 	 * the copybuf (present in each swqe), we don't need to release it.
5817 	 */
5818 	if (swqe->swqe_im_mblk != NULL) {
5819 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
5820 			ibd_unmap_mem(state, swqe);
5821 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
5822 			ibd_release_lsobufs(state,
5823 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
5824 		}
5825 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
5826 		freemsg(swqe->swqe_im_mblk);
5827 		swqe->swqe_im_mblk = NULL;
5828 	}
5829 
5830 	/*
5831 	 * Drop the reference count on the AH; it can be reused
5832 	 * now for a different destination if there are no more
5833 	 * posted sends that will use it. This can be eliminated
5834 	 * if we can always associate each Tx buffer with an AH.
5835 	 * The ace can be null if we are cleaning up from the
5836 	 * ibd_send() error path.
5837 	 */
5838 	if (ace != NULL) {
5839 		/*
5840 		 * The recycling logic can be eliminated from here
5841 		 * and put into the async thread if we create another
5842 		 * list to hold ACE's for unjoined mcg's.
5843 		 */
5844 		if (DEC_REF_DO_CYCLE(ace)) {
5845 			ibd_mce_t *mce;
5846 
5847 			/*
5848 			 * Check with the lock taken: we decremented
5849 			 * reference count without the lock, and some
5850 			 * transmitter might already have bumped the
5851 			 * reference count (possible in case of multicast
5852 			 * disable when we leave the AH on the active
5853 			 * list). If not still 0, get out, leaving the
5854 			 * recycle bit intact.
5855 			 *
5856 			 * Atomically transition the AH from active
5857 			 * to free list, and queue a work request to
5858 			 * leave the group and destroy the mce. No
5859 			 * transmitter can be looking at the AH or
5860 			 * the MCE in between, since we have the
5861 			 * ac_mutex lock. In the SendOnly reap case,
5862 			 * it is not necessary to hold the ac_mutex
5863 			 * and recheck the ref count (since the AH was
5864 			 * taken off the active list), we just do it
5865 			 * to have uniform processing with the Full
5866 			 * reap case.
5867 			 */
5868 			mutex_enter(&state->id_ac_mutex);
5869 			mce = ace->ac_mce;
5870 			if (GET_REF_CYCLE(ace) == 0) {
5871 				CLEAR_REFCYCLE(ace);
5872 				/*
5873 				 * Identify the case of fullmember reap as
5874 				 * opposed to mcg trap reap. Also, port up
5875 				 * might set ac_mce to NULL to indicate Tx
5876 				 * cleanup should do no more than put the
5877 				 * AH in the free list (see ibd_async_link).
5878 				 */
5879 				if (mce != NULL) {
5880 					ace->ac_mce = NULL;
5881 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
5882 					/*
5883 					 * mc_req was initialized at mce
5884 					 * creation time.
5885 					 */
5886 					ibd_queue_work_slot(state,
5887 					    &mce->mc_req, IBD_ASYNC_REAP);
5888 				}
5889 				IBD_ACACHE_INSERT_FREE(state, ace);
5890 			}
5891 			mutex_exit(&state->id_ac_mutex);
5892 		}
5893 	}
5894 
5895 	/*
5896 	 * Release the send wqe for reuse.
5897 	 */
5898 	ibd_release_swqe(state, swqe);
5899 }
5900 
5901 /*
5902  * Hand off the processed rx mp chain to mac_rx()
5903  */
5904 static void
5905 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
5906 {
5907 	if (mpc == NULL) {
5908 		mutex_enter(&state->id_rx_lock);
5909 
5910 		mpc = state->id_rx_mp;
5911 
5912 		state->id_rx_mp = NULL;
5913 		state->id_rx_mp_tail = NULL;
5914 		state->id_rx_mp_len = 0;
5915 
5916 		mutex_exit(&state->id_rx_lock);
5917 	}
5918 
5919 	if (mpc) {
5920 		mac_rx(state->id_mh, state->id_rh, mpc);
5921 	}
5922 }
5923 
5924 /*
5925  * Processing to be done after receipt of a packet; hand off to GLD
5926  * in the format expected by GLD.  The received packet has this
5927  * format: 2b sap :: 00 :: data.
5928  */
5929 static void
5930 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5931 {
5932 	ib_header_info_t *phdr;
5933 	mblk_t *mp;
5934 	mblk_t *mpc = NULL;
5935 	ipoib_hdr_t *ipibp;
5936 	ipha_t *iphap;
5937 	ip6_t *ip6h;
5938 	int rxcnt, len;
5939 
5940 	/*
5941 	 * Track number handed to upper layer, and number still
5942 	 * available to receive packets.
5943 	 */
5944 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5945 	ASSERT(rxcnt >= 0);
5946 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5947 
5948 	/*
5949 	 * Adjust write pointer depending on how much data came in.
5950 	 */
5951 	mp = rwqe->rwqe_im_mblk;
5952 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5953 
5954 	/*
5955 	 * Make sure this is NULL or we're in trouble.
5956 	 */
5957 	if (mp->b_next != NULL) {
5958 		ibd_print_warn(state,
5959 		    "ibd_process_rx: got duplicate mp from rcq?");
5960 		mp->b_next = NULL;
5961 	}
5962 
5963 	/*
5964 	 * The IB link layer may deliver a Global Routing Header (GRH)
5965 	 * with the datagram.  The ibd driver uses the information in
5966 	 * the GRH to build the header_info structure that is passed up
5967 	 * to GLDv3 with the datagram.
5968 	 * If the GRH is not valid, indicate this to GLDv3 by setting
5969 	 * the VerTcFlow field to 0.
5971 	 */
5972 	phdr = (ib_header_info_t *)mp->b_rptr;
5973 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
5974 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
5975 
5976 		/* if it is loop back packet, just drop it. */
5977 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
5978 		    IPOIB_ADDRL) == 0) {
5979 			freemsg(mp);
5980 			return;
5981 		}
5982 
5983 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
5984 		    sizeof (ipoib_mac_t));
5985 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
5986 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
5987 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
5988 		} else {
5989 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
5990 		}
5991 	} else {
5992 		/*
5993 		 * It cannot be an IBA multicast packet, so it must have been
5994 		 * unicast to us. Just copy the interface address to dst.
5995 		 */
5996 		phdr->ib_grh.ipoib_vertcflow = 0;
5997 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
5998 		    sizeof (ipoib_mac_t));
5999 	}
6000 
6001 	/*
6002 	 * For ND6 packets, padding is at the front of the source/target
6003 	 * lladdr. However, the inet6 layer is not aware of it, so remove
6004 	 * the padding from such packets.
6005 	 */
6006 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6007 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
6008 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
6009 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6010 			    sizeof (ipoib_hdr_t))) {
6011 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
6012 				freemsg(mp);
6013 				return;
6014 			}
6015 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
6016 			    sizeof (ipoib_pgrh_t));
6017 		}
6018 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6019 		len = ntohs(ip6h->ip6_plen);
6020 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6021 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
6022 			    IPV6_HDR_LEN + len) {
6023 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
6024 				    IPV6_HDR_LEN + len)) {
6025 					DPRINT(10, "ibd_process_rx: pullupmsg"
6026 					    " failed");
6027 					freemsg(mp);
6028 					return;
6029 				}
6030 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6031 				    sizeof (ipoib_pgrh_t) +
6032 				    sizeof (ipoib_hdr_t));
6033 			}
6034 			/* LINTED: E_CONSTANT_CONDITION */
6035 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6036 		}
6037 	}
6038 
6039 	/*
6040 	 * Update statistics
6041 	 */
6042 	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
6043 	atomic_inc_64(&state->id_rcv_pkt);
6044 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6045 		atomic_inc_64(&state->id_brd_rcv);
6046 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6047 		atomic_inc_64(&state->id_multi_rcv);
6048 
6049 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6050 	/*
6051 	 * Set the receive checksum status in mp.
6052 	 * Hardware checksumming can be considered valid only if:
6053 	 * 1. CQE.IP_OK bit is set
6054 	 * 2. CQE.CKSUM = 0xffff
6055 	 * 3. IPv6 routing header is not present in the packet
6056 	 * 4. There are no IP options in the IP header
6057 	 */
6058 
6059 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6060 	    (wc->wc_cksum == 0xFFFF) &&
6061 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6062 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6063 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6064 	}
6065 
6066 	/*
6067 	 * Add this mp to the list of processed mp's to send to
6068 	 * the nw layer
6069 	 */
6070 	mutex_enter(&state->id_rx_lock);
6071 	if (state->id_rx_mp) {
6072 		ASSERT(state->id_rx_mp_tail != NULL);
6073 		state->id_rx_mp_tail->b_next = mp;
6074 	} else {
6075 		ASSERT(state->id_rx_mp_tail == NULL);
6076 		state->id_rx_mp = mp;
6077 	}
6078 
6079 	state->id_rx_mp_tail = mp;
6080 	state->id_rx_mp_len++;
6081 
6082 	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
6083 		mpc = state->id_rx_mp;
6084 
6085 		state->id_rx_mp = NULL;
6086 		state->id_rx_mp_tail = NULL;
6087 		state->id_rx_mp_len = 0;
6088 	}
6089 
6090 	mutex_exit(&state->id_rx_lock);
6091 
6092 	if (mpc) {
6093 		ibd_flush_rx(state, mpc);
6094 	}
6095 }
6096 
6097 /*
6098  * Callback code invoked from STREAMS when the receive data buffer is
6099  * free for recycling.
6100  */
6101 static void
6102 ibd_freemsg_cb(char *arg)
6103 {
6104 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6105 	ibd_state_t *state = rwqe->w_state;
6106 
6107 	/*
6108 	 * If the wqe is being destructed, do not attempt recycling.
6109 	 */
6110 	if (rwqe->w_freeing_wqe == B_TRUE) {
6111 		DPRINT(6, "ibd_freemsg: wqe being freed");
6112 		return;
6113 	} else {
6114 		/*
6115 		 * The upper layer has released the held mblk, so we no
6116 		 * longer need to keep the old pointer in our rwqe.
6118 		 */
6119 		rwqe->rwqe_im_mblk = NULL;
6120 	}
6121 
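	/*
	 * Wrap the receive buffer in a fresh mblk and repost the rwqe;
	 * if either step fails, retire this rwqe altogether.
	 */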
6122 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6123 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6124 	if (rwqe->rwqe_im_mblk == NULL) {
6125 		ibd_delete_rwqe(state, rwqe);
6126 		ibd_free_rwqe(state, rwqe);
6127 		DPRINT(6, "ibd_freemsg: desballoc failed");
6128 		return;
6129 	}
6130 
6131 	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
6132 		ibd_delete_rwqe(state, rwqe);
6133 		ibd_free_rwqe(state, rwqe);
6134 		return;
6135 	}
6136 
6137 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6138 }
6139 
6140 static uint_t
6141 ibd_tx_recycle(char *arg)
6142 {
6143 	ibd_state_t *state = (ibd_state_t *)arg;
6144 
6145 	/*
6146 	 * Poll for completed entries
6147 	 */
6148 	ibd_poll_compq(state, state->id_scq_hdl);
6149 
6150 	/*
6151 	 * Resume any blocked transmissions if possible
6152 	 */
6153 	(void) ibd_resume_transmission(state);
6154 
6155 	return (DDI_INTR_CLAIMED);
6156 }
6157 
6158 #ifdef IBD_LOGGING
6159 static void
6160 ibd_log_init(void)
6161 {
6162 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6163 	ibd_lbuf_ndx = 0;
6164 }
6165 
6166 static void
6167 ibd_log_fini(void)
6168 {
6169 	if (ibd_lbuf)
6170 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6171 	ibd_lbuf_ndx = 0;
6172 	ibd_lbuf = NULL;
6173 }
6174 
6175 static void
6176 ibd_log(const char *fmt, ...)
6177 {
6178 	va_list	ap;
6179 	uint32_t off;
6180 	uint32_t msglen;
6181 	char tmpbuf[IBD_DMAX_LINE];
6182 
6183 	if (ibd_lbuf == NULL)
6184 		return;
6185 
6186 	va_start(ap, fmt);
6187 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
6188 	va_end(ap);
6189 
6190 	if (msglen >= IBD_DMAX_LINE)
6191 		msglen = IBD_DMAX_LINE - 1;
6192 
6193 	mutex_enter(&ibd_lbuf_lock);
6194 
6195 	off = ibd_lbuf_ndx;		/* current msg should go here */
6196 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6197 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6198 
6199 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6200 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6201 
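	/* wrap around if we're getting too close to the end of the buffer */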
6202 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6203 		ibd_lbuf_ndx = 0;
6204 
6205 	mutex_exit(&ibd_lbuf_lock);
6206 
6207 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6208 }
6209 #endif
6210