xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_ring.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2023 The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 #include <sys/sdt.h>
29 
30 #include <sys/pattr.h>
31 #include <sys/dlpi.h>
32 
33 #include <sys/mac_provider.h>
34 
35 #include <sys/random.h>
36 
37 #include <mlxcx.h>
38 
39 boolean_t
40 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
41 {
42 	ddi_device_acc_attr_t acc;
43 	ddi_dma_attr_t attr;
44 	boolean_t ret;
45 	size_t sz;
46 
47 	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
48 
49 	/* Receive and send queue entries might be different sizes. */
50 	switch (mlwq->mlwq_type) {
51 	case MLXCX_WQ_TYPE_SENDQ:
52 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
53 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
54 		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
55 		break;
56 	case MLXCX_WQ_TYPE_RECVQ:
57 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
58 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
59 		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
60 		break;
61 	default:
62 		VERIFY(0);
63 		return (B_FALSE);
64 	}
65 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
66 
67 	mlxcx_dma_acc_attr(mlxp, &acc);
68 	mlxcx_dma_queue_attr(mlxp, &attr);
69 
70 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
71 	    B_TRUE, sz, B_TRUE);
72 	if (!ret) {
73 		mlxcx_warn(mlxp, "failed to allocate WQ memory");
74 		return (B_FALSE);
75 	}
76 
77 	/*
78 	 * Just set the first pointer in the union. Yes, this is a strict
79 	 * aliasing violation. No, I don't care.
80 	 */
81 	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
82 
83 	mlxcx_dma_acc_attr(mlxp, &acc);
84 	mlxcx_dma_qdbell_attr(mlxp, &attr);
85 	sz = sizeof (mlxcx_workq_doorbell_t);
86 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
87 	    B_TRUE, sz, B_TRUE);
88 	if (!ret) {
89 		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
90 		mlxcx_dma_free(&mlwq->mlwq_dma);
91 		mlwq->mlwq_send_ent = NULL;
92 		return (B_FALSE);
93 	}
94 
95 	mlwq->mlwq_doorbell =
96 	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
97 
98 	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
99 
100 	return (B_TRUE);
101 }
102 
103 void
104 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
105 {
106 	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
107 	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
108 		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
109 
110 	mlxcx_dma_free(&mlwq->mlwq_dma);
111 	mlwq->mlwq_send_ent = NULL;
112 	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
113 	mlwq->mlwq_doorbell = NULL;
114 
115 	mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC;
116 }
117 
118 static boolean_t
119 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
120     uint_t ent_shift)
121 {
122 	ddi_device_acc_attr_t acc;
123 	ddi_dma_attr_t attr;
124 	boolean_t ret;
125 	size_t sz, i;
126 
127 	VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC);
128 
129 	mlcq->mlcq_entshift = ent_shift;
130 	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
131 	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
132 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
133 
134 	mlxcx_dma_acc_attr(mlxp, &acc);
135 	mlxcx_dma_queue_attr(mlxp, &attr);
136 
137 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
138 	    B_TRUE, sz, B_TRUE);
139 	if (!ret) {
140 		mlxcx_warn(mlxp, "failed to allocate CQ memory");
141 		return (B_FALSE);
142 	}
143 
144 	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
145 
146 	for (i = 0; i < mlcq->mlcq_nents; ++i) {
147 		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
148 		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
149 	}
150 
151 	mlxcx_dma_acc_attr(mlxp, &acc);
152 	mlxcx_dma_qdbell_attr(mlxp, &attr);
153 	sz = sizeof (mlxcx_completionq_doorbell_t);
154 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
155 	    B_TRUE, sz, B_TRUE);
156 	if (!ret) {
157 		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
158 		mlxcx_dma_free(&mlcq->mlcq_dma);
159 		mlcq->mlcq_ent = NULL;
160 		return (B_FALSE);
161 	}
162 
163 	mlcq->mlcq_doorbell =
164 	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
165 
166 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);
167 
168 	return (B_TRUE);
169 }
170 
171 static void
172 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
173 {
174 	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
175 	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
176 		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177 
178 	mlxcx_dma_free(&mlcq->mlcq_dma);
179 	mlcq->mlcq_ent = NULL;
180 	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
181 	mlcq->mlcq_doorbell = NULL;
182 
183 	atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
184 }
185 
186 void
187 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
188 {
189 	mlxcx_completion_queue_t *mlcq;
190 
191 	/*
192 	 * If something is holding the lock on a long operation like a
193 	 * refill, setting this flag asks them to exit early if possible.
194 	 */
195 	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
196 
197 	mutex_enter(&mlwq->mlwq_mtx);
198 
199 	list_remove(&mlxp->mlx_wqs, mlwq);
200 
201 	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
202 	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
203 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
204 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
205 		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
206 			mlxcx_warn(mlxp, "failed to stop "
207 			    "recv queue num %x", mlwq->mlwq_num);
208 		}
209 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
210 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
211 		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
212 			mlxcx_warn(mlxp, "failed to stop "
213 			    "send queue num %x", mlwq->mlwq_num);
214 		}
215 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
216 		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
217 			mlxcx_warn(mlxp, "failed to destroy "
218 			    "recv queue num %x", mlwq->mlwq_num);
219 		}
220 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
221 		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
222 			mlxcx_warn(mlxp, "failed to destroy "
223 			    "send queue num %x", mlwq->mlwq_num);
224 		}
225 	}
226 	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
227 		mlxcx_wq_rele_dma(mlxp, mlwq);
228 	}
229 	mlcq = mlwq->mlwq_cq;
230 
231 	/* These will be released by mlxcx_teardown_bufs() */
232 	mlwq->mlwq_bufs = NULL;
233 	mlwq->mlwq_foreign_bufs = NULL;
234 
235 	mutex_exit(&mlwq->mlwq_mtx);
236 
237 	mutex_enter(&mlcq->mlcq_mtx);
238 	mutex_enter(&mlwq->mlwq_mtx);
239 	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
240 	mlcq->mlcq_wq = NULL;
241 	mutex_exit(&mlwq->mlwq_mtx);
242 	mutex_exit(&mlcq->mlcq_mtx);
243 
244 	mutex_destroy(&mlwq->mlwq_mtx);
245 }
246 
/*
 * Tear down a completion queue and free it.
 *
 * The queue is removed from the driver's list of completion queues,
 * destroyed in hardware if it is still live, has its DMA memory released,
 * has any buffers still attached to it returned, and is removed from its
 * event queue's AVL tree if it is on one. Finally its mutexes and lists
 * are destroyed and the structure itself is freed, so mlcq must not be
 * used by the caller after this returns.
 */
void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	mlxcx_event_queue_t *mleq;
	mlxcx_buffer_t *b;

	/*
	 * If something is holding the lock on a long operation like polling
	 * which we're going to abort anyway, this flag asks them to exit
	 * early if possible.
	 */
	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

	mutex_enter(&mlcq->mlcq_mtx);

	list_remove(&mlxp->mlx_cqs, mlcq);

	/* Only destroy in hardware a CQ that was created and is still live. */
	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "completion queue num %u",
			    mlcq->mlcq_num);
		}
	}
	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
		mlxcx_cq_rele_dma(mlxp, mlcq);
	}
	/*
	 * If we're on an EQ AVL tree, then we need to grab
	 * the EQ's mutex to take it off. The ISR always takes
	 * EQ mutex before CQ mutex, so we have to let go of
	 * the CQ mutex then come back again.
	 *
	 * The ISR will bail out if it tries to touch this CQ now, since
	 * the CQ_DESTROYED flag was added above.
	 * NOTE(review): nothing in this function sets CQ_DESTROYED
	 * directly; presumably mlxcx_cmd_destroy_cq() does -- confirm.
	 *
	 * Here we only remember which EQ (if any) we need to visit; the
	 * actual removal happens further down, once the CQ mutex has been
	 * dropped.
	 */
	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
		mleq = mlcq->mlcq_eq;
	} else {
		mleq = NULL;
	}

	/* Return any outstanding buffers to the free pool. */
	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	/* The second buffer list is protected by its own mutex. */
	mutex_enter(&mlcq->mlcq_bufbmtx);
	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_exit(&mlcq->mlcq_bufbmtx);

	/*
	 * Since the interrupt handlers take the EQ lock before the CQ one,
	 * we must do the same here. That means letting go of the lock
	 * for a brief window here (we'll double-check the state when we
	 * get back in).
	 */
	mutex_exit(&mlcq->mlcq_mtx);

	if (mleq != NULL) {
		mutex_enter(&mleq->mleq_mtx);
		mutex_enter(&mlcq->mlcq_mtx);
		/*
		 * Double-check the state, we let go of the
		 * mutex briefly.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
			avl_remove(&mleq->mleq_cqs, mlcq);
			atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
		}
		mutex_exit(&mlcq->mlcq_mtx);
		mutex_exit(&mleq->mleq_mtx);
	}

	/*
	 * By now only the CREATED, DESTROYED, TEARDOWN and ARMED bits may
	 * remain set; anything else (e.g. ALLOC or EQAVL) means a teardown
	 * step above was missed.
	 */
	mutex_enter(&mlcq->mlcq_mtx);
	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
	mutex_exit(&mlcq->mlcq_mtx);

	/* Undo everything mlxcx_cq_setup() initialised, then free the CQ. */
	mutex_destroy(&mlcq->mlcq_mtx);
	mutex_destroy(&mlcq->mlcq_arm_mtx);
	mutex_destroy(&mlcq->mlcq_bufbmtx);
	list_destroy(&mlcq->mlcq_buffers);
	list_destroy(&mlcq->mlcq_buffers_b);
	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
}
335 
336 static boolean_t
337 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
338     mlxcx_completion_queue_t **cqp, uint_t ent_shift)
339 {
340 	mlxcx_completion_queue_t *cq;
341 
342 	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
343 	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
344 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
345 	mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
346 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
347 	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
348 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
349 	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
350 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
351 	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
352 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
353 
354 	cq->mlcq_mlx = mlxp;
355 	list_insert_tail(&mlxp->mlx_cqs, cq);
356 
357 	mutex_enter(&cq->mlcq_mtx);
358 
359 	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
360 		mutex_exit(&cq->mlcq_mtx);
361 		return (B_FALSE);
362 	}
363 
364 	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
365 	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
366 
367 	cq->mlcq_uar = &mlxp->mlx_uar;
368 	cq->mlcq_eq = eq;
369 
370 	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
371 	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
372 
373 	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
374 		mutex_exit(&cq->mlcq_mtx);
375 		return (B_FALSE);
376 	}
377 
378 	mutex_exit(&cq->mlcq_mtx);
379 
380 	mutex_enter(&eq->mleq_mtx);
381 	mutex_enter(&cq->mlcq_arm_mtx);
382 	mutex_enter(&cq->mlcq_mtx);
383 	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
384 	avl_add(&eq->mleq_cqs, cq);
385 	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
386 	mlxcx_arm_cq(mlxp, cq);
387 	mutex_exit(&cq->mlcq_mtx);
388 	mutex_exit(&cq->mlcq_arm_mtx);
389 	mutex_exit(&eq->mleq_mtx);
390 
391 	*cqp = cq;
392 	return (B_TRUE);
393 }
394 
395 static boolean_t
396 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
397     mlxcx_work_queue_t *wq)
398 {
399 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
400 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
401 
402 	list_insert_tail(&mlxp->mlx_wqs, wq);
403 
404 	mutex_enter(&wq->mlwq_mtx);
405 
406 	wq->mlwq_mlx = mlxp;
407 	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
408 	wq->mlwq_cq = cq;
409 	wq->mlwq_pd = &mlxp->mlx_pd;
410 	wq->mlwq_uar = &mlxp->mlx_uar;
411 
412 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
413 
414 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
415 		mutex_exit(&wq->mlwq_mtx);
416 		return (B_FALSE);
417 	}
418 
419 	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
420 		mutex_exit(&wq->mlwq_mtx);
421 		return (B_FALSE);
422 	}
423 
424 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
425 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
426 
427 	mutex_exit(&wq->mlwq_mtx);
428 
429 	mutex_enter(&cq->mlcq_mtx);
430 	mutex_enter(&wq->mlwq_mtx);
431 	ASSERT3P(cq->mlcq_wq, ==, NULL);
432 	cq->mlcq_wq = wq;
433 	mutex_exit(&wq->mlwq_mtx);
434 	mutex_exit(&cq->mlcq_mtx);
435 
436 	return (B_TRUE);
437 }
438 
439 static boolean_t
440 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
441     mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
442 {
443 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
444 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
445 
446 	list_insert_tail(&mlxp->mlx_wqs, wq);
447 
448 	mutex_enter(&wq->mlwq_mtx);
449 
450 	wq->mlwq_mlx = mlxp;
451 	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
452 	wq->mlwq_cq = cq;
453 	wq->mlwq_pd = &mlxp->mlx_pd;
454 	wq->mlwq_uar = &mlxp->mlx_uar;
455 	wq->mlwq_tis = tis;
456 
457 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
458 	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
459 
460 	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
461 	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
462 
463 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
464 		mutex_exit(&wq->mlwq_mtx);
465 		return (B_FALSE);
466 	}
467 
468 	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
469 		mutex_exit(&wq->mlwq_mtx);
470 		return (B_FALSE);
471 	}
472 
473 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
474 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
475 
476 	mutex_exit(&wq->mlwq_mtx);
477 
478 	mutex_enter(&cq->mlcq_mtx);
479 	mutex_enter(&wq->mlwq_mtx);
480 	ASSERT3P(cq->mlcq_wq, ==, NULL);
481 	cq->mlcq_wq = wq;
482 	mutex_exit(&wq->mlwq_mtx);
483 	mutex_exit(&cq->mlcq_mtx);
484 
485 	return (B_TRUE);
486 }
487 
488 /*
489  * Before we tear down the queues associated with the rx group,
490  * flag each cq as being torn down and wake up any tasks.
491  */
492 static void
493 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
494 {
495 	mlxcx_work_queue_t *wq;
496 	mlxcx_completion_queue_t *cq;
497 	mlxcx_buf_shard_t *s;
498 	uint_t i;
499 
500 	mutex_enter(&g->mlg_mtx);
501 
502 	for (i = 0; i < g->mlg_nwqs; ++i) {
503 		wq = &g->mlg_wqs[i];
504 		cq = wq->mlwq_cq;
505 		if (cq != NULL) {
506 			s = wq->mlwq_bufs;
507 			mutex_enter(&s->mlbs_mtx);
508 			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
509 			cv_broadcast(&s->mlbs_free_nonempty);
510 			mutex_exit(&s->mlbs_mtx);
511 		}
512 	}
513 
514 	mutex_exit(&g->mlg_mtx);
515 }
516 
517 void
518 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
519 {
520 	mlxcx_work_queue_t *wq;
521 	mlxcx_completion_queue_t *cq;
522 	mlxcx_flow_entry_t *fe;
523 	mlxcx_flow_group_t *fg;
524 	mlxcx_flow_table_t *ft;
525 	uint_t i;
526 
527 	mutex_enter(&g->mlg_port->mlp_mtx);
528 	mutex_enter(&g->mlg_mtx);
529 
530 	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
531 		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
532 
533 		if (g->mlg_rx_vlan_ft != NULL)
534 			mlxcx_remove_all_vlan_entries(mlxp, g);
535 
536 		if (g == &mlxp->mlx_rx_groups[0]) {
537 			ft = g->mlg_port->mlp_rx_flow;
538 			mutex_enter(&ft->mlft_mtx);
539 
540 			fg = g->mlg_port->mlp_bcast;
541 			fe = list_head(&fg->mlfg_entries);
542 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
543 				(void) mlxcx_cmd_delete_flow_table_entry(
544 				    mlxp, fe);
545 			}
546 
547 			fg = g->mlg_port->mlp_promisc;
548 			fe = list_head(&fg->mlfg_entries);
549 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
550 				(void) mlxcx_cmd_delete_flow_table_entry(
551 				    mlxp, fe);
552 			}
553 
554 			mutex_exit(&ft->mlft_mtx);
555 		}
556 
557 		if (g->mlg_rx_vlan_ft != NULL) {
558 			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
559 			ASSERT(list_is_empty(&g->mlg_rx_vlans));
560 			fg = g->mlg_rx_vlan_def_fg;
561 			if (fg != NULL) {
562 				fe = list_head(&fg->mlfg_entries);
563 				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
564 					(void)
565 					    mlxcx_cmd_delete_flow_table_entry(
566 					    mlxp, fe);
567 				}
568 			}
569 			fg = g->mlg_rx_vlan_promisc_fg;
570 			if (fg != NULL) {
571 				fe = list_head(&fg->mlfg_entries);
572 				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
573 					(void)
574 					    mlxcx_cmd_delete_flow_table_entry(
575 					    mlxp, fe);
576 				}
577 			}
578 			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
579 			list_destroy(&g->mlg_rx_vlans);
580 
581 			g->mlg_rx_vlan_ft = NULL;
582 		}
583 
584 		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
585 		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
586 		g->mlg_rx_hash_ft = NULL;
587 
588 		avl_destroy(&g->mlg_rx_macs);
589 		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
590 	}
591 
592 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
593 		for (i = 0; i < g->mlg_nwqs; ++i) {
594 			wq = &g->mlg_wqs[i];
595 			mutex_enter(&wq->mlwq_mtx);
596 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
597 			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
598 				mlxcx_warn(mlxp, "failed to stop rq %x",
599 				    wq->mlwq_num);
600 			}
601 			mutex_exit(&wq->mlwq_mtx);
602 		}
603 		taskq_destroy(g->mlg_refill_tq);
604 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
605 	}
606 
607 	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
608 		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
609 			mlxcx_tir_t *tir = &g->mlg_tir[i];
610 			if (tir->mltir_state & MLXCX_TIR_CREATED &&
611 			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
612 				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
613 					mlxcx_warn(mlxp,
614 					    "failed to destroy tir %u "
615 					    "for rx ring", tir->mltir_num);
616 				}
617 			}
618 		}
619 		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
620 	}
621 
622 	if (g->mlg_state & MLXCX_GROUP_RQT) {
623 		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
624 		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
625 			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
626 				mlxcx_warn(mlxp, "failed to destroy rqt %u "
627 				    "for rx ring", g->mlg_rqt->mlrqt_num);
628 			}
629 			kmem_free(g->mlg_rqt->mlrqt_rq,
630 			    g->mlg_rqt->mlrqt_rq_size);
631 			g->mlg_rqt->mlrqt_rq = NULL;
632 			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
633 			g->mlg_rqt = NULL;
634 		}
635 		g->mlg_state &= ~MLXCX_GROUP_RQT;
636 	}
637 
638 	for (i = 0; i < g->mlg_nwqs; ++i) {
639 		wq = &g->mlg_wqs[i];
640 		cq = wq->mlwq_cq;
641 		mlxcx_wq_teardown(mlxp, wq);
642 		if (cq != NULL)
643 			mlxcx_cq_teardown(mlxp, cq);
644 	}
645 	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
646 	g->mlg_wqs = NULL;
647 	g->mlg_state &= ~MLXCX_GROUP_WQS;
648 
649 	mutex_exit(&g->mlg_mtx);
650 	mutex_exit(&g->mlg_port->mlp_mtx);
651 
652 	mutex_destroy(&g->mlg_mtx);
653 
654 	g->mlg_state &= ~MLXCX_GROUP_INIT;
655 	ASSERT3S(g->mlg_state, ==, 0);
656 }
657 
658 void
659 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
660 {
661 	mlxcx_work_queue_t *wq;
662 	mlxcx_completion_queue_t *cq;
663 	uint_t i;
664 
665 	mutex_enter(&g->mlg_mtx);
666 
667 	if (g->mlg_state & MLXCX_GROUP_WQS) {
668 		for (i = 0; i < g->mlg_nwqs; ++i) {
669 			wq = &g->mlg_wqs[i];
670 			mutex_enter(&wq->mlwq_mtx);
671 			cq = wq->mlwq_cq;
672 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
673 			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
674 				mlxcx_warn(mlxp, "failed to stop sq %x",
675 				    wq->mlwq_num);
676 			}
677 			mutex_exit(&wq->mlwq_mtx);
678 			mlxcx_wq_teardown(mlxp, wq);
679 			if (cq != NULL)
680 				mlxcx_cq_teardown(mlxp, cq);
681 		}
682 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
683 		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
684 		g->mlg_wqs = NULL;
685 		g->mlg_state &= ~MLXCX_GROUP_WQS;
686 	}
687 
688 	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
689 	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
690 	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
691 		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
692 			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
693 			    g->mlg_tis.mltis_num);
694 		}
695 	}
696 	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
697 
698 	mutex_exit(&g->mlg_mtx);
699 	mutex_destroy(&g->mlg_mtx);
700 	g->mlg_state &= ~MLXCX_GROUP_INIT;
701 	ASSERT3S(g->mlg_state, ==, 0);
702 }
703 
704 void
705 mlxcx_teardown_groups(mlxcx_t *mlxp)
706 {
707 	mlxcx_ring_group_t *g;
708 	uint_t i;
709 
710 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
711 		g = &mlxp->mlx_rx_groups[i];
712 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
713 			continue;
714 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
715 		mlxcx_quiesce_rx_cqs(mlxp, g);
716 	}
717 
718 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
719 		g = &mlxp->mlx_rx_groups[i];
720 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
721 			continue;
722 		mlxcx_teardown_rx_group(mlxp, g);
723 	}
724 
725 	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
726 	mlxp->mlx_rx_groups = NULL;
727 
728 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
729 		g = &mlxp->mlx_tx_groups[i];
730 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
731 			continue;
732 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
733 		mlxcx_teardown_tx_group(mlxp, g);
734 	}
735 
736 	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
737 	mlxp->mlx_tx_groups = NULL;
738 }
739 
740 boolean_t
741 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
742 {
743 	mlxcx_event_queue_t *eq;
744 	mlxcx_completion_queue_t *cq;
745 	mlxcx_work_queue_t *rq;
746 	mlxcx_flow_table_t *ft;
747 	mlxcx_flow_group_t *fg;
748 	mlxcx_flow_entry_t *fe;
749 	uint_t ent_shift;
750 	uint_t i, j;
751 
752 	ASSERT3S(g->mlg_state, ==, 0);
753 
754 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
755 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
756 	mutex_enter(&g->mlg_mtx);
757 	g->mlg_mlx = mlxp;
758 	g->mlg_type = MLXCX_GROUP_RX;
759 	g->mlg_port = &mlxp->mlx_ports[0];
760 	g->mlg_state |= MLXCX_GROUP_INIT;
761 
762 	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
763 	i = g - &mlxp->mlx_rx_groups[0];
764 	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
765 		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
766 
767 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
768 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
769 	g->mlg_state |= MLXCX_GROUP_WQS;
770 
771 	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
772 	g->mlg_rqt->mlrqt_max = 2;
773 	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
774 		g->mlg_rqt->mlrqt_max <<= 1;
775 	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
776 	    sizeof (mlxcx_work_queue_t *);
777 	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
778 	g->mlg_state |= MLXCX_GROUP_RQT;
779 
780 	for (i = 0; i < g->mlg_nwqs; ++i) {
781 		eq = NULL;
782 		while (eq == NULL) {
783 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
784 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
785 				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
786 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
787 			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
788 				/* Try the next one */
789 				eq = NULL;
790 			}
791 		}
792 
793 		/*
794 		 * A single completion is indicated for each rq entry as
795 		 * it is used. So, the number of cq entries never needs
796 		 * to be larger than the rq.
797 		 */
798 		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
799 		    mlxp->mlx_props.mldp_rq_size_shift);
800 		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
801 			g->mlg_nwqs = i;
802 			break;
803 		}
804 
805 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
806 
807 		rq = &g->mlg_wqs[i];
808 		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
809 			g->mlg_nwqs = i;
810 			break;
811 		}
812 		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
813 		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
814 		rq->mlwq_group = g;
815 	}
816 	if (g->mlg_nwqs == 0) {
817 		mutex_exit(&g->mlg_mtx);
818 		return (B_FALSE);
819 	}
820 
821 	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
822 		mutex_exit(&g->mlg_mtx);
823 		return (B_FALSE);
824 	}
825 
826 	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
827 		mlxcx_tir_t *tir = &g->mlg_tir[i];
828 		tir->mltir_tdom = &mlxp->mlx_tdom;
829 		switch (i) {
830 		case MLXCX_TIR_ROLE_OTHER:
831 			tir->mltir_type = MLXCX_TIR_DIRECT;
832 			tir->mltir_rq = &g->mlg_wqs[0];
833 			break;
834 		case MLXCX_TIR_ROLE_IPv4:
835 		case MLXCX_TIR_ROLE_IPv6:
836 		case MLXCX_TIR_ROLE_TCPv4:
837 		case MLXCX_TIR_ROLE_TCPv6:
838 		case MLXCX_TIR_ROLE_UDPv4:
839 		case MLXCX_TIR_ROLE_UDPv6:
840 			tir->mltir_type = MLXCX_TIR_INDIRECT;
841 			tir->mltir_rqtable = g->mlg_rqt;
842 			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
843 			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
844 			    sizeof (tir->mltir_toeplitz_key));
845 			break;
846 		}
847 		switch (i) {
848 		case MLXCX_TIR_ROLE_OTHER:
849 			break;
850 		case MLXCX_TIR_ROLE_IPv4:
851 		case MLXCX_TIR_ROLE_TCPv4:
852 		case MLXCX_TIR_ROLE_UDPv4:
853 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
854 			tir->mltir_hash_fields =
855 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
856 			break;
857 		case MLXCX_TIR_ROLE_IPv6:
858 		case MLXCX_TIR_ROLE_TCPv6:
859 		case MLXCX_TIR_ROLE_UDPv6:
860 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
861 			tir->mltir_hash_fields =
862 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
863 			break;
864 		}
865 		switch (i) {
866 		case MLXCX_TIR_ROLE_OTHER:
867 		case MLXCX_TIR_ROLE_IPv4:
868 		case MLXCX_TIR_ROLE_IPv6:
869 			break;
870 		case MLXCX_TIR_ROLE_TCPv4:
871 		case MLXCX_TIR_ROLE_TCPv6:
872 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
873 			tir->mltir_hash_fields |=
874 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
875 			break;
876 		case MLXCX_TIR_ROLE_UDPv4:
877 		case MLXCX_TIR_ROLE_UDPv6:
878 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
879 			tir->mltir_hash_fields |=
880 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
881 			break;
882 		}
883 
884 		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
885 			mutex_exit(&g->mlg_mtx);
886 			return (B_FALSE);
887 		}
888 
889 		g->mlg_state |= MLXCX_GROUP_TIRTIS;
890 	}
891 
892 	/*
893 	 * Flow table: our RX hashing breakout table for RSS
894 	 */
895 
896 	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
897 	    KM_SLEEP));
898 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
899 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
900 	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
901 	    sizeof (mlxcx_group_mac_t),
902 	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
903 	g->mlg_state |= MLXCX_GROUP_FLOWS;
904 
905 	mutex_enter(&ft->mlft_mtx);
906 
907 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
908 	ft->mlft_level = 2;
909 	ft->mlft_port = g->mlg_port;
910 	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
911 	ft->mlft_nents = (1 << ft->mlft_entshift);
912 	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
913 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
914 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
915 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
916 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
917 
918 	for (j = 0; j < ft->mlft_nents; ++j) {
919 		ft->mlft_ent[j].mlfe_table = ft;
920 		ft->mlft_ent[j].mlfe_index = j;
921 	}
922 
923 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
924 		mutex_exit(&ft->mlft_mtx);
925 		mutex_exit(&g->mlg_mtx);
926 		return (B_FALSE);
927 	}
928 
929 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
930 	list_insert_tail(&ft->mlft_groups, fg);
931 	fg->mlfg_table = ft;
932 	fg->mlfg_size = 1;
933 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
934 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
935 		mutex_exit(&ft->mlft_mtx);
936 		mutex_exit(&g->mlg_mtx);
937 		return (B_FALSE);
938 	}
939 	fe = list_head(&fg->mlfg_entries);
940 	fe->mlfe_ip_version = 6;
941 	fe->mlfe_ip_proto = IPPROTO_UDP;
942 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
943 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
944 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
945 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
946 		mutex_exit(&ft->mlft_mtx);
947 		mutex_exit(&g->mlg_mtx);
948 		return (B_FALSE);
949 	}
950 
951 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
952 	list_insert_tail(&ft->mlft_groups, fg);
953 	fg->mlfg_table = ft;
954 	fg->mlfg_size = 1;
955 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
956 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
957 		mutex_exit(&ft->mlft_mtx);
958 		mutex_exit(&g->mlg_mtx);
959 		return (B_FALSE);
960 	}
961 	fe = list_head(&fg->mlfg_entries);
962 	fe->mlfe_ip_version = 4;
963 	fe->mlfe_ip_proto = IPPROTO_UDP;
964 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
965 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
966 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
967 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
968 		mutex_exit(&ft->mlft_mtx);
969 		mutex_exit(&g->mlg_mtx);
970 		return (B_FALSE);
971 	}
972 
973 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
974 	list_insert_tail(&ft->mlft_groups, fg);
975 	fg->mlfg_table = ft;
976 	fg->mlfg_size = 1;
977 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
978 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
979 		mutex_exit(&ft->mlft_mtx);
980 		mutex_exit(&g->mlg_mtx);
981 		return (B_FALSE);
982 	}
983 	fe = list_head(&fg->mlfg_entries);
984 	fe->mlfe_ip_version = 6;
985 	fe->mlfe_ip_proto = IPPROTO_TCP;
986 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
987 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
988 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
989 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
990 		mutex_exit(&ft->mlft_mtx);
991 		mutex_exit(&g->mlg_mtx);
992 		return (B_FALSE);
993 	}
994 
995 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
996 	list_insert_tail(&ft->mlft_groups, fg);
997 	fg->mlfg_table = ft;
998 	fg->mlfg_size = 1;
999 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
1000 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1001 		mutex_exit(&ft->mlft_mtx);
1002 		mutex_exit(&g->mlg_mtx);
1003 		return (B_FALSE);
1004 	}
1005 	fe = list_head(&fg->mlfg_entries);
1006 	fe->mlfe_ip_version = 4;
1007 	fe->mlfe_ip_proto = IPPROTO_TCP;
1008 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1009 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1010 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
1011 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1012 		mutex_exit(&ft->mlft_mtx);
1013 		mutex_exit(&g->mlg_mtx);
1014 		return (B_FALSE);
1015 	}
1016 
1017 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1018 	list_insert_tail(&ft->mlft_groups, fg);
1019 	fg->mlfg_table = ft;
1020 	fg->mlfg_size = 1;
1021 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1022 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1023 		mutex_exit(&ft->mlft_mtx);
1024 		mutex_exit(&g->mlg_mtx);
1025 		return (B_FALSE);
1026 	}
1027 	fe = list_head(&fg->mlfg_entries);
1028 	fe->mlfe_ip_version = 6;
1029 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1030 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1031 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
1032 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1033 		mutex_exit(&ft->mlft_mtx);
1034 		mutex_exit(&g->mlg_mtx);
1035 		return (B_FALSE);
1036 	}
1037 
1038 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1039 	list_insert_tail(&ft->mlft_groups, fg);
1040 	fg->mlfg_table = ft;
1041 	fg->mlfg_size = 1;
1042 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1043 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1044 		mutex_exit(&ft->mlft_mtx);
1045 		mutex_exit(&g->mlg_mtx);
1046 		return (B_FALSE);
1047 	}
1048 	fe = list_head(&fg->mlfg_entries);
1049 	fe->mlfe_ip_version = 4;
1050 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1051 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1052 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
1053 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1054 		mutex_exit(&ft->mlft_mtx);
1055 		mutex_exit(&g->mlg_mtx);
1056 		return (B_FALSE);
1057 	}
1058 
1059 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1060 	list_insert_tail(&ft->mlft_groups, fg);
1061 	fg->mlfg_table = ft;
1062 	fg->mlfg_size = 1;
1063 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1064 		mutex_exit(&ft->mlft_mtx);
1065 		mutex_exit(&g->mlg_mtx);
1066 		return (B_FALSE);
1067 	}
1068 	fe = list_head(&fg->mlfg_entries);
1069 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1070 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1071 	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1072 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1073 		mutex_exit(&ft->mlft_mtx);
1074 		mutex_exit(&g->mlg_mtx);
1075 		return (B_FALSE);
1076 	}
1077 
1078 	mutex_exit(&ft->mlft_mtx);
1079 
1080 	/*
1081 	 * Flow table: the VLAN breakout table for doing VLAN filtering after
1082 	 * we've matched a MAC address.
1083 	 */
1084 
1085 	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1086 	    KM_SLEEP));
1087 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1088 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1089 	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1090 	    offsetof(mlxcx_group_vlan_t, mlgv_entry));
1091 
1092 	mutex_enter(&ft->mlft_mtx);
1093 
1094 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1095 	ft->mlft_level = 1;
1096 	ft->mlft_port = g->mlg_port;
1097 	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1098 	ft->mlft_nents = (1 << ft->mlft_entshift);
1099 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1100 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1101 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1102 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
1103 
1104 	for (j = 0; j < ft->mlft_nents; ++j) {
1105 		fe = &ft->mlft_ent[j];
1106 		fe->mlfe_table = ft;
1107 		fe->mlfe_index = j;
1108 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1109 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1110 	}
1111 
1112 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1113 		mutex_exit(&ft->mlft_mtx);
1114 		mutex_exit(&g->mlg_mtx);
1115 		return (B_FALSE);
1116 	}
1117 
1118 	/* First group is all actual matched VLANs */
1119 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1120 	g->mlg_rx_vlan_fg = fg;
1121 	list_insert_tail(&ft->mlft_groups, fg);
1122 	fg->mlfg_table = ft;
1123 	fg->mlfg_size = ft->mlft_nents - 2;
1124 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1125 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1126 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1127 		mutex_exit(&ft->mlft_mtx);
1128 		mutex_exit(&g->mlg_mtx);
1129 		return (B_FALSE);
1130 	}
1131 
1132 	/*
1133 	 * Then the "default" entry which we enable when we have no VLAN IDs
1134 	 * added to the group (we start with this enabled).
1135 	 */
1136 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1137 	g->mlg_rx_vlan_def_fg = fg;
1138 	list_insert_tail(&ft->mlft_groups, fg);
1139 	fg->mlfg_table = ft;
1140 	fg->mlfg_size = 1;
1141 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1142 		mutex_exit(&ft->mlft_mtx);
1143 		mutex_exit(&g->mlg_mtx);
1144 		return (B_FALSE);
1145 	}
1146 	fe = list_head(&fg->mlfg_entries);
1147 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1148 		mutex_exit(&ft->mlft_mtx);
1149 		mutex_exit(&g->mlg_mtx);
1150 		return (B_FALSE);
1151 	}
1152 
1153 	/*
1154 	 * Finally, the promisc entry which points at the *hash ft* from the
1155 	 * default group. We only enable this when we have promisc on.
1156 	 */
1157 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1158 	g->mlg_rx_vlan_promisc_fg = fg;
1159 	list_insert_tail(&ft->mlft_groups, fg);
1160 	fg->mlfg_table = ft;
1161 	fg->mlfg_size = 1;
1162 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1163 		mutex_exit(&ft->mlft_mtx);
1164 		mutex_exit(&g->mlg_mtx);
1165 		return (B_FALSE);
1166 	}
1167 	fe = list_head(&fg->mlfg_entries);
1168 	fe->mlfe_ndest = 1;
1169 	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1170 
1171 	mutex_exit(&ft->mlft_mtx);
1172 
1173 	mutex_exit(&g->mlg_mtx);
1174 
1175 	return (B_TRUE);
1176 }
1177 
1178 boolean_t
1179 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1180     mlxcx_work_queue_t *rq)
1181 {
1182 	uint_t j;
1183 	mlxcx_buffer_t *b;
1184 	mlxcx_completion_queue_t *cq;
1185 
1186 	mutex_enter(&g->mlg_mtx);
1187 	/*
1188 	 * Sadly, even though MAC has the mgi_start callback, it is not always
1189 	 * called -- in particular when we are being managed under an aggr, the
1190 	 * mgi_start callback will only ever be called on the default group.
1191 	 *
1192 	 * So instead of asserting about the group state here, we have to
1193 	 * check it and call group start if needed.
1194 	 */
1195 	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1196 		mutex_exit(&g->mlg_mtx);
1197 		if (!mlxcx_rx_group_start(mlxp, g))
1198 			return (B_FALSE);
1199 		mutex_enter(&g->mlg_mtx);
1200 	}
1201 	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1202 
1203 	cq = rq->mlwq_cq;
1204 	ASSERT(cq != NULL);
1205 
1206 	mutex_enter(&cq->mlcq_mtx);
1207 	mutex_enter(&rq->mlwq_mtx);
1208 
1209 	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1210 		mutex_exit(&rq->mlwq_mtx);
1211 		mutex_exit(&cq->mlcq_mtx);
1212 		mutex_exit(&g->mlg_mtx);
1213 		return (B_TRUE);
1214 	}
1215 
1216 	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1217 		mutex_exit(&rq->mlwq_mtx);
1218 		mutex_exit(&cq->mlcq_mtx);
1219 		mutex_exit(&g->mlg_mtx);
1220 		return (B_FALSE);
1221 	}
1222 	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1223 
1224 	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1225 	rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1226 
1227 	mlxcx_shard_ready(rq->mlwq_bufs);
1228 
1229 	for (j = 0; j < rq->mlwq_nents; ++j) {
1230 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1231 			break;
1232 		mlxcx_buf_return(mlxp, b);
1233 	}
1234 	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1235 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1236 			break;
1237 		mlxcx_buf_return(mlxp, b);
1238 	}
1239 
1240 	mlxcx_rq_refill(mlxp, rq);
1241 
1242 	mutex_exit(&rq->mlwq_mtx);
1243 	mutex_exit(&cq->mlcq_mtx);
1244 	mutex_exit(&g->mlg_mtx);
1245 
1246 	return (B_TRUE);
1247 }
1248 
1249 boolean_t
1250 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1251 {
1252 	mlxcx_flow_table_t *ft;
1253 	mlxcx_flow_group_t *fg;
1254 	mlxcx_flow_entry_t *fe;
1255 	char tq_name[TASKQ_NAMELEN];
1256 
1257 	mutex_enter(&g->mlg_mtx);
1258 
1259 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1260 		mutex_exit(&g->mlg_mtx);
1261 		return (B_TRUE);
1262 	}
1263 
1264 	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1265 
1266 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1267 
1268 	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
1269 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
1270 	    g - &mlxp->mlx_rx_groups[0]);
1271 
1272 	/*
1273 	 * Create one refill taskq per group with one thread per work queue.
1274 	 * The refill task may block waiting for resources, so by effectively
1275 	 * having one thread per work queue we avoid work queues blocking each
1276 	 * other.
1277 	 */
1278 	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
1279 	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
1280 		mlxcx_warn(mlxp, "failed to create rq refill task queue");
1281 		mutex_exit(&g->mlg_mtx);
1282 		return (B_FALSE);
1283 	}
1284 
1285 	if (g == &mlxp->mlx_rx_groups[0]) {
1286 		ft = g->mlg_port->mlp_rx_flow;
1287 		mutex_enter(&ft->mlft_mtx);
1288 
1289 		/*
1290 		 * Broadcast and promisc entries go directly to group 0's
1291 		 * RSS hash fanout flow table. They bypass VLAN filtering.
1292 		 */
1293 		fg = g->mlg_port->mlp_bcast;
1294 		fe = list_head(&fg->mlfg_entries);
1295 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1296 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1297 			mutex_exit(&ft->mlft_mtx);
1298 			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
1299 			taskq_destroy(g->mlg_refill_tq);
1300 			mutex_exit(&g->mlg_mtx);
1301 			return (B_FALSE);
1302 		}
1303 
1304 		fg = g->mlg_port->mlp_promisc;
1305 		fe = list_head(&fg->mlfg_entries);
1306 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1307 		/*
1308 		 * Don't actually set the promisc entry until promisc is
1309 		 * enabled.
1310 		 */
1311 
1312 		mutex_exit(&ft->mlft_mtx);
1313 	}
1314 
1315 	mutex_exit(&g->mlg_mtx);
1316 
1317 	return (B_TRUE);
1318 }
1319 
1320 boolean_t
1321 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1322 {
1323 	mlxcx_event_queue_t *eq;
1324 	mlxcx_completion_queue_t *cq;
1325 	mlxcx_work_queue_t *sq;
1326 	uint_t i;
1327 
1328 	ASSERT3S(g->mlg_state, ==, 0);
1329 
1330 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1331 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1332 	g->mlg_state |= MLXCX_GROUP_INIT;
1333 	mutex_enter(&g->mlg_mtx);
1334 
1335 	g->mlg_mlx = mlxp;
1336 	g->mlg_type = MLXCX_GROUP_TX;
1337 	g->mlg_port = &mlxp->mlx_ports[0];
1338 
1339 	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1340 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1341 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1342 	g->mlg_state |= MLXCX_GROUP_WQS;
1343 
1344 	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1345 
1346 	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1347 		mutex_exit(&g->mlg_mtx);
1348 		return (B_FALSE);
1349 	}
1350 
1351 	g->mlg_state |= MLXCX_GROUP_TIRTIS;
1352 
1353 	for (i = 0; i < g->mlg_nwqs; ++i) {
1354 		eq = NULL;
1355 		while (eq == NULL) {
1356 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1357 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1358 				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
1359 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1360 			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1361 				/* Try the next one */
1362 				eq = NULL;
1363 			}
1364 		}
1365 
1366 		if (!mlxcx_cq_setup(mlxp, eq, &cq,
1367 		    mlxp->mlx_props.mldp_cq_size_shift))
1368 			return (B_FALSE);
1369 
1370 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
1371 
1372 		sq = &g->mlg_wqs[i];
1373 		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1374 			mutex_exit(&g->mlg_mtx);
1375 			return (B_FALSE);
1376 		}
1377 		sq->mlwq_group = g;
1378 	}
1379 
1380 	mutex_exit(&g->mlg_mtx);
1381 
1382 	return (B_TRUE);
1383 }
1384 
1385 boolean_t
1386 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1387     mlxcx_work_queue_t *sq)
1388 {
1389 	uint_t i;
1390 	mlxcx_buffer_t *b;
1391 	mlxcx_completion_queue_t *cq;
1392 
1393 	mutex_enter(&g->mlg_mtx);
1394 
1395 	cq = sq->mlwq_cq;
1396 	ASSERT(cq != NULL);
1397 
1398 	mutex_enter(&cq->mlcq_mtx);
1399 	mutex_enter(&sq->mlwq_mtx);
1400 	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1401 		mutex_exit(&sq->mlwq_mtx);
1402 		mutex_exit(&cq->mlcq_mtx);
1403 		mutex_exit(&g->mlg_mtx);
1404 		return (B_TRUE);
1405 	}
1406 
1407 	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
1408 	for (i = 0; i < sq->mlwq_nents; ++i) {
1409 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1410 			break;
1411 		mlxcx_buf_return(mlxp, b);
1412 	}
1413 	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1414 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1415 			break;
1416 		mlxcx_buf_return(mlxp, b);
1417 	}
1418 	for (i = 0; i < sq->mlwq_nents; ++i) {
1419 		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1420 			break;
1421 		mlxcx_buf_return(mlxp, b);
1422 	}
1423 	sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1424 
1425 	mlxcx_shard_ready(sq->mlwq_bufs);
1426 	mlxcx_shard_ready(sq->mlwq_foreign_bufs);
1427 
1428 	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1429 		mutex_exit(&sq->mlwq_mtx);
1430 		mutex_exit(&cq->mlcq_mtx);
1431 		mutex_exit(&g->mlg_mtx);
1432 		return (B_FALSE);
1433 	}
1434 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1435 
1436 	(void) mlxcx_sq_add_nop(mlxp, sq);
1437 
1438 	mutex_exit(&sq->mlwq_mtx);
1439 	mutex_exit(&cq->mlcq_mtx);
1440 	mutex_exit(&g->mlg_mtx);
1441 
1442 	return (B_TRUE);
1443 }
1444 
1445 static boolean_t
1446 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1447 {
1448 	uint_t idx;
1449 	mlxcx_bf_t *bf;
1450 	ddi_fm_error_t err;
1451 	uint_t try = 0;
1452 
1453 	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1454 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1455 
1456 	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1457 
1458 	ASSERT(mlwq->mlwq_cq != NULL);
1459 	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
1460 	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1461 	bf = &mlwq->mlwq_uar->mlu_bf[idx];
1462 
1463 retry:
1464 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1465 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1466 	    DDI_FME_VERSION);
1467 	if (err.fme_status != DDI_FM_OK) {
1468 		if (try++ < mlxcx_doorbell_tries) {
1469 			ddi_fm_dma_err_clear(
1470 			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1471 			    DDI_FME_VERSION);
1472 			goto retry;
1473 		} else {
1474 			goto err;
1475 		}
1476 	}
1477 
1478 	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1479 	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1480 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1481 	    DDI_FME_VERSION);
1482 	if (err.fme_status == DDI_FM_OK)
1483 		return (B_TRUE);
1484 	if (try++ < mlxcx_doorbell_tries) {
1485 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1486 		goto retry;
1487 	}
1488 
1489 err:
1490 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1491 	return (B_FALSE);
1492 }
1493 
1494 boolean_t
1495 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1496 {
1497 	uint_t index, start_pc;
1498 	mlxcx_sendq_ent_t *ent0;
1499 	ddi_fm_error_t err;
1500 
1501 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1502 
1503 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1504 	ent0 = &mlwq->mlwq_send_ent[index];
1505 	start_pc = mlwq->mlwq_pc;
1506 	++mlwq->mlwq_pc;
1507 	/*
1508 	 * This counter is manipulated in the interrupt handler, which
1509 	 * does not hold the mlwq_mtx, hence the atomic.
1510 	 */
1511 	atomic_inc_64(&mlwq->mlwq_wqebb_used);
1512 
1513 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1514 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1515 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1516 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1517 
1518 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1519 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1520 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1521 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1522 
1523 	ent0->mlsqe_control.mlcs_ds = 1;
1524 
1525 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1526 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1527 	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1528 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1529 	    DDI_FME_VERSION);
1530 	if (err.fme_status != DDI_FM_OK) {
1531 		return (B_FALSE);
1532 	}
1533 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1534 		return (B_FALSE);
1535 	}
1536 	return (B_TRUE);
1537 }
1538 
1539 boolean_t
1540 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1541     uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
1542     mlxcx_buffer_t *b0)
1543 {
1544 	uint_t index, first, ents;
1545 	mlxcx_completion_queue_t *cq;
1546 	mlxcx_sendq_ent_t *ent0;
1547 	mlxcx_sendq_extra_ent_t *ent;
1548 	mlxcx_wqe_data_seg_t *seg;
1549 	uint_t ptri, nptr;
1550 	const ddi_dma_cookie_t *c;
1551 	size_t rem;
1552 	uint64_t wqebb_used;
1553 	mlxcx_buffer_t *b;
1554 	ddi_fm_error_t err;
1555 	boolean_t rv;
1556 
1557 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1558 	ASSERT3P(b0->mlb_tx_head, ==, b0);
1559 	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1560 	cq = mlwq->mlwq_cq;
1561 
1562 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1563 	ent0 = &mlwq->mlwq_send_ent[index];
1564 	b0->mlb_wqe_index = mlwq->mlwq_pc;
1565 	ents = 1;
1566 
1567 	first = index;
1568 
1569 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1570 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
1571 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1572 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);
1573 
1574 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1575 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
1576 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1577 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1578 
1579 	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
1580 	set_bits16(&ent0->mlsqe_eth.mles_szflags,
1581 	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
1582 	if (inlinelen > 0) {
1583 		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
1584 		    inlinelen);
1585 	}
1586 
1587 	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
1588 	    MLXCX_WQE_OCTOWORD;
1589 
1590 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1591 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1592 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1593 		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
1594 	}
1595 	if (chkflags & HCK_FULLCKSUM) {
1596 		ASSERT(mlxp->mlx_caps->mlc_checksum);
1597 		set_bit8(&ent0->mlsqe_eth.mles_csflags,
1598 		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
1599 	}
1600 
1601 	/*
1602 	 * mlwq_wqebb_used is only incremented whilst holding
1603 	 * the mlwq_mtx mutex, but it is decremented (atomically) in
1604 	 * the interrupt context *not* under mlwq_mtx mutex.
1605 	 * So, now take a snapshot of the number of used wqes which will
1606 	 * be a conistent maximum we can use whilst iterating through
1607 	 * the buffers and DMA cookies.
1608 	 */
1609 	wqebb_used = mlwq->mlwq_wqebb_used;
1610 
1611 	b = b0;
1612 	ptri = 0;
1613 	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
1614 	seg = ent0->mlsqe_data;
1615 	while (b != NULL) {
1616 		rem = b->mlb_used;
1617 
1618 		c = NULL;
1619 		while (rem > 0 &&
1620 		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
1621 			if (ptri >= nptr) {
1622 				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
1623 					return (B_FALSE);
1624 
1625 				index = (mlwq->mlwq_pc + ents) &
1626 				    (mlwq->mlwq_nents - 1);
1627 				ent = &mlwq->mlwq_send_extra_ent[index];
1628 				++ents;
1629 
1630 				seg = ent->mlsqe_data;
1631 				ptri = 0;
1632 				nptr = sizeof (ent->mlsqe_data) /
1633 				    sizeof (mlxcx_wqe_data_seg_t);
1634 			}
1635 
1636 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1637 			if (c->dmac_size > rem) {
1638 				seg->mlds_byte_count = to_be32(rem);
1639 				rem = 0;
1640 			} else {
1641 				seg->mlds_byte_count = to_be32(c->dmac_size);
1642 				rem -= c->dmac_size;
1643 			}
1644 			seg->mlds_address = to_be64(c->dmac_laddress);
1645 			++seg;
1646 			++ptri;
1647 			++ent0->mlsqe_control.mlcs_ds;
1648 
1649 			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
1650 			    MLXCX_SQE_MAX_DS);
1651 		}
1652 
1653 		if (b == b0) {
1654 			b = list_head(&b0->mlb_tx_chain);
1655 		} else {
1656 			b = list_next(&b0->mlb_tx_chain, b);
1657 		}
1658 	}
1659 
1660 	b0->mlb_wqebbs = ents;
1661 	mlwq->mlwq_pc += ents;
1662 	atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
1663 
1664 	for (; ptri < nptr; ++ptri, ++seg) {
1665 		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1666 		seg->mlds_byte_count = to_be32(0);
1667 		seg->mlds_address = to_be64(0);
1668 	}
1669 
1670 	/*
1671 	 * Make sure the workqueue entry is flushed out before updating
1672 	 * the doorbell.
1673 	 * If the ring has wrapped, we need to flush the front and back.
1674 	 */
1675 	if ((first + ents) > mlwq->mlwq_nents) {
1676 		uint_t sync_cnt = mlwq->mlwq_nents - first;
1677 
1678 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1679 		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1680 		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
1681 		    DDI_DMA_SYNC_FORDEV));
1682 
1683 		ent0 = &mlwq->mlwq_send_ent[0];
1684 		ents -= sync_cnt;
1685 	}
1686 
1687 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1688 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1689 	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1690 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1691 	    DDI_FME_VERSION);
1692 	if (err.fme_status != DDI_FM_OK) {
1693 		return (B_FALSE);
1694 	}
1695 
1696 	/*
1697 	 * Hold the bufmtx whilst ringing the doorbell, to prevent
1698 	 * the buffer from being moved to another list, so we can
1699 	 * safely remove it should the ring fail.
1700 	 */
1701 	mutex_enter(&cq->mlcq_bufbmtx);
1702 
1703 	list_insert_tail(&cq->mlcq_buffers_b, b0);
1704 	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
1705 		atomic_inc_64(&cq->mlcq_bufcnt);
1706 	} else {
1707 		list_remove(&cq->mlcq_buffers_b, b0);
1708 	}
1709 
1710 	mutex_exit(&cq->mlcq_bufbmtx);
1711 
1712 	return (rv);
1713 }
1714 
1715 boolean_t
1716 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1717     mlxcx_buffer_t *buf)
1718 {
1719 	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1720 }
1721 
1722 boolean_t
1723 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1724     mlxcx_buffer_t **bufs, size_t nbufs)
1725 {
1726 	uint_t index;
1727 	mlxcx_recvq_ent_t *ent;
1728 	mlxcx_completion_queue_t *cq;
1729 	mlxcx_wqe_data_seg_t *seg;
1730 	uint_t bi, ptri;
1731 	const ddi_dma_cookie_t *c;
1732 	mlxcx_buffer_t *buf;
1733 	ddi_fm_error_t err;
1734 
1735 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1736 	cq = mlwq->mlwq_cq;
1737 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1738 
1739 	for (bi = 0; bi < nbufs; ++bi) {
1740 		buf = bufs[bi];
1741 		bufs[bi] = NULL;
1742 		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1743 
1744 		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1745 		ent = &mlwq->mlwq_recv_ent[index];
1746 		buf->mlb_wqe_index = mlwq->mlwq_pc;
1747 		buf->mlb_wqebbs = 1;
1748 
1749 		++mlwq->mlwq_pc;
1750 		atomic_inc_64(&mlwq->mlwq_wqebb_used);
1751 
1752 		mutex_enter(&cq->mlcq_bufbmtx);
1753 		list_insert_tail(&cq->mlcq_buffers, buf);
1754 		atomic_inc_64(&cq->mlcq_bufcnt);
1755 		mutex_exit(&cq->mlcq_bufbmtx);
1756 
1757 		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1758 		ptri = 0;
1759 		c = NULL;
1760 		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1761 			seg = &ent->mlrqe_data[ptri++];
1762 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1763 			seg->mlds_byte_count = to_be32(c->dmac_size);
1764 			seg->mlds_address = to_be64(c->dmac_laddress);
1765 		}
1766 		/*
1767 		 * Fill any unused scatter pointers with the special null
1768 		 * value.
1769 		 */
1770 		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1771 			seg = &ent->mlrqe_data[ptri];
1772 			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1773 			seg->mlds_byte_count = to_be32(0);
1774 			seg->mlds_address = to_be64(0);
1775 		}
1776 
1777 		/*
1778 		 * Make sure the workqueue entry is flushed out before updating
1779 		 * the doorbell.
1780 		 */
1781 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1782 		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1783 		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1784 		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1785 		    DDI_FME_VERSION);
1786 		if (err.fme_status != DDI_FM_OK) {
1787 			return (B_FALSE);
1788 		}
1789 	}
1790 
1791 	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1792 	/*
1793 	 * Flush the CQ doorbell as well so that HW knows how many
1794 	 * completions we've consumed.
1795 	 */
1796 	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1797 	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1798 	    DDI_FME_VERSION);
1799 	if (err.fme_status != DDI_FM_OK) {
1800 		return (B_FALSE);
1801 	}
1802 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1803 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1804 	    DDI_FME_VERSION);
1805 	if (err.fme_status != DDI_FM_OK) {
1806 		return (B_FALSE);
1807 	}
1808 	return (B_TRUE);
1809 }
1810 
1811 static void
1812 mlxcx_rq_refill_task(void *arg)
1813 {
1814 	mlxcx_work_queue_t *wq = arg;
1815 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1816 	mlxcx_t *mlxp = wq->mlwq_mlx;
1817 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
1818 	boolean_t refill, draining;
1819 
1820 	do {
1821 		/*
1822 		 * Wait here until one of 3 conditions:
1823 		 * 1. The shard is draining, or
1824 		 * 2. There are buffers on the free list, or
1825 		 * 3. The WQ is being shut down.
1826 		 */
1827 		mutex_enter(&s->mlbs_mtx);
1828 		while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
1829 		    list_is_empty(&s->mlbs_free) &&
1830 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
1831 			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1832 		}
1833 
1834 		draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
1835 		mutex_exit(&s->mlbs_mtx);
1836 
1837 		mutex_enter(&cq->mlcq_mtx);
1838 		mutex_enter(&wq->mlwq_mtx);
1839 
1840 		if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
1841 			refill = B_FALSE;
1842 			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1843 		} else {
1844 			mlxcx_rq_refill(mlxp, wq);
1845 
1846 			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
1847 				refill = B_TRUE;
1848 			} else {
1849 				refill = B_FALSE;
1850 				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1851 			}
1852 		}
1853 
1854 		mutex_exit(&wq->mlwq_mtx);
1855 		mutex_exit(&cq->mlcq_mtx);
1856 	} while (refill);
1857 }
1858 
1859 void
1860 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1861 {
1862 	size_t target, current, want, done, n;
1863 	mlxcx_completion_queue_t *cq;
1864 	mlxcx_ring_group_t *g;
1865 	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1866 	uint_t i;
1867 
1868 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1869 	cq = mlwq->mlwq_cq;
1870 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1871 
1872 	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1873 
1874 	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1875 	cq = mlwq->mlwq_cq;
1876 
1877 	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
1878 		return;
1879 
1880 	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
1881 		return;
1882 
1883 	current = cq->mlcq_bufcnt;
1884 
1885 	if (current >= target - MLXCX_RQ_REFILL_STEP)
1886 		return;
1887 
1888 	want = target - current;
1889 	done = 0;
1890 
1891 	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1892 		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1893 		if (n == 0) {
1894 			/*
1895 			 * We didn't get any buffers from the free queue.
1896 			 * It might not be an issue, schedule a taskq
1897 			 * to wait for free buffers if the completion
1898 			 * queue is low.
1899 			 */
1900 			if (current < MLXCX_RQ_REFILL_STEP &&
1901 			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
1902 				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
1903 				g = mlwq->mlwq_group;
1904 				taskq_dispatch_ent(g->mlg_refill_tq,
1905 				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
1906 				    &mlwq->mlwq_tqe);
1907 			}
1908 
1909 			return;
1910 		}
1911 
1912 		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
1913 			for (i = 0; i < n; ++i)
1914 				mlxcx_buf_return(mlxp, b[i]);
1915 			return;
1916 		}
1917 		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1918 			/*
1919 			 * mlxcx_rq_add_buffers NULLs out the buffers as it
1920 			 * enqueues them, so any that are non-NULL we have to
1921 			 * free now. The others now belong to the WQ, even if
1922 			 * we failed.
1923 			 */
1924 			for (i = 0; i < n; ++i) {
1925 				if (b[i] != NULL) {
1926 					mlxcx_buf_return(mlxp, b[i]);
1927 				}
1928 			}
1929 			return;
1930 		}
1931 		done += n;
1932 	}
1933 }
1934 
1935 static const char *
1936 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1937 {
1938 	switch (sy) {
1939 	case MLXCX_CQ_ERR_LOCAL_LENGTH:
1940 		return ("LOCAL_LENGTH");
1941 	case MLXCX_CQ_ERR_LOCAL_QP_OP:
1942 		return ("LOCAL_QP_OP");
1943 	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1944 		return ("LOCAL_PROTECTION");
1945 	case MLXCX_CQ_ERR_WR_FLUSHED:
1946 		return ("WR_FLUSHED");
1947 	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1948 		return ("MEM_WINDOW_BIND");
1949 	case MLXCX_CQ_ERR_BAD_RESPONSE:
1950 		return ("BAD_RESPONSE");
1951 	case MLXCX_CQ_ERR_LOCAL_ACCESS:
1952 		return ("LOCAL_ACCESS");
1953 	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1954 		return ("XPORT_RETRY_CTR");
1955 	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1956 		return ("RNR_RETRY_CTR");
1957 	case MLXCX_CQ_ERR_ABORTED:
1958 		return ("ABORTED");
1959 	default:
1960 		return ("UNKNOWN");
1961 	}
1962 }
1963 
1964 static void
1965 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1966     mlxcx_completionq_error_ent_t *ent)
1967 {
1968 	uint64_t ena;
1969 	char buf[FM_MAX_CLASS];
1970 	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1971 
1972 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1973 		return;
1974 
1975 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1976 	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1977 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1978 
1979 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1980 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1981 	    "syndrome", DATA_TYPE_STRING, name,
1982 	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1983 	    "vendor_syndrome", DATA_TYPE_UINT8,
1984 	    ent->mlcqee_vendor_error_syndrome,
1985 	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1986 	    "wq_type", DATA_TYPE_STRING,
1987 	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
1988 	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1989 	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1990 	    NULL);
1991 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1992 }
1993 
1994 void
1995 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1996     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1997 {
1998 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1999 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
2000 		mlxcx_completionq_error_ent_t *eent =
2001 		    (mlxcx_completionq_error_ent_t *)ent;
2002 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2003 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2004 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2005 		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
2006 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2007 		return;
2008 	}
2009 
2010 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
2011 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2012 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2013 		return;
2014 	}
2015 
2016 	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
2017 		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
2018 		    ent->mlcqe_send_wqe_opcode);
2019 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2020 		return;
2021 	}
2022 
2023 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2024 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2025 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2026 		return;
2027 	}
2028 
2029 	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2030 }
2031 
2032 mblk_t *
2033 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
2034     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
2035 {
2036 	uint32_t chkflags = 0;
2037 	uint_t wqe_index, used;
2038 	ddi_fm_error_t err;
2039 	mblk_t *mp;
2040 
2041 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
2042 
2043 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
2044 		mlxcx_completionq_error_ent_t *eent =
2045 		    (mlxcx_completionq_error_ent_t *)ent;
2046 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2047 		mlxcx_buf_return(mlxp, buf);
2048 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2049 		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
2050 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2051 		return (NULL);
2052 	}
2053 
2054 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
2055 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2056 		mlxcx_buf_return(mlxp, buf);
2057 		return (NULL);
2058 	}
2059 
2060 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2061 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2062 		mlxcx_buf_return(mlxp, buf);
2063 		return (NULL);
2064 	}
2065 
2066 	if (ent->mlcqe_rx_drop_counter > 0) {
2067 		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
2068 		    ent->mlcqe_rx_drop_counter);
2069 	}
2070 
2071 	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
2072 	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
2073 	    DDI_FME_VERSION);
2074 	if (err.fme_status != DDI_FM_OK) {
2075 		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
2076 		    DDI_FME_VERSION);
2077 		mlxcx_buf_return(mlxp, buf);
2078 		return (NULL);
2079 	}
2080 
2081 	/*
2082 	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
2083 	 * Remember it for later.
2084 	 */
2085 	wqe_index = buf->mlb_wqe_index;
2086 
2087 	/* Set the used field with the actual length of the packet. */
2088 	buf->mlb_used = (used = from_be32(ent->mlcqe_byte_cnt));
2089 
2090 	/* Try to loan this buffer to MAC directly. */
2091 	if (mlxcx_buf_loan(mlxp, buf)) {
2092 		mp = buf->mlb_mp;
2093 
2094 	} else {
2095 		/*
2096 		 * Loan rejected: we will try to allocate a new mblk and copy
2097 		 * this packet for MAC instead.
2098 		 */
2099 		mp = allocb(buf->mlb_used, 0);
2100 		if (mp == NULL) {
2101 			/* No memory :( */
2102 			atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 1);
2103 			mlxcx_buf_return(mlxp, buf);
2104 			return (NULL);
2105 		}
2106 		bcopy((unsigned char *)buf->mlb_dma.mxdb_va, mp->b_rptr,
2107 		    buf->mlb_used);
2108 
2109 		/* We're done with this buf now, return it to the free list. */
2110 		mlxcx_buf_return(mlxp, buf);
2111 		buf = NULL;
2112 	}
2113 
2114 	mp->b_next = NULL;
2115 	mp->b_cont = NULL;
2116 	mp->b_wptr = mp->b_rptr + used;
2117 
2118 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
2119 		chkflags |= HCK_FULLCKSUM_OK;
2120 	}
2121 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
2122 		chkflags |= HCK_IPV4_HDRCKSUM_OK;
2123 	}
2124 	if (chkflags != 0) {
2125 		mac_hcksum_set(mp, 0, 0, 0, from_be16(ent->mlcqe_checksum),
2126 		    chkflags);
2127 	}
2128 
2129 	/*
2130 	 * Don't check if a refill is needed on every single completion,
2131 	 * since checking involves taking the RQ lock.
2132 	 */
2133 	if ((wqe_index & 0x7) == 0) {
2134 		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
2135 		ASSERT(wq != NULL);
2136 		mutex_enter(&wq->mlwq_mtx);
2137 		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
2138 			mlxcx_rq_refill(mlxp, wq);
2139 		mutex_exit(&wq->mlwq_mtx);
2140 	}
2141 
2142 	return (mp);
2143 }
2144 
2145 static void
2146 mlxcx_buf_mp_return(caddr_t arg)
2147 {
2148 	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
2149 	mlxcx_t *mlxp = b->mlb_mlx;
2150 
2151 	/* The mblk has been used now, so NULL it out. */
2152 	b->mlb_mp = NULL;
2153 
2154 	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
2155 		mlxcx_buf_return(mlxp, b);
2156 }
2157 
2158 boolean_t
2159 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
2160 {
2161 	mlxcx_buffer_t *b;
2162 	ddi_device_acc_attr_t acc;
2163 	ddi_dma_attr_t attr;
2164 	boolean_t ret;
2165 
2166 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2167 	b->mlb_shard = shard;
2168 	b->mlb_foreign = B_FALSE;
2169 
2170 	mlxcx_dma_acc_attr(mlxp, &acc);
2171 	mlxcx_dma_buf_attr(mlxp, &attr);
2172 
2173 	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
2174 	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
2175 	if (!ret) {
2176 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2177 		return (B_FALSE);
2178 	}
2179 
2180 	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
2181 	b->mlb_frtn.free_arg = (caddr_t)b;
2182 	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2183 	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2184 
2185 	*bp = b;
2186 
2187 	return (B_TRUE);
2188 }
2189 
2190 boolean_t
2191 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
2192     mlxcx_buffer_t **bp)
2193 {
2194 	mlxcx_buffer_t *b;
2195 	ddi_dma_attr_t attr;
2196 	boolean_t ret;
2197 
2198 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2199 	b->mlb_shard = shard;
2200 	b->mlb_foreign = B_TRUE;
2201 
2202 	mlxcx_dma_buf_attr(mlxp, &attr);
2203 
2204 	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
2205 	if (!ret) {
2206 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2207 		return (B_FALSE);
2208 	}
2209 
2210 	*bp = b;
2211 
2212 	return (B_TRUE);
2213 }
2214 
2215 static mlxcx_buffer_t *
2216 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2217 {
2218 	mlxcx_buffer_t *b;
2219 	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
2220 
2221 	mutex_enter(&s->mlbs_mtx);
2222 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2223 		mutex_exit(&s->mlbs_mtx);
2224 		return (NULL);
2225 	}
2226 
2227 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2228 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2229 		ASSERT(b->mlb_foreign);
2230 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2231 		list_insert_tail(&s->mlbs_busy, b);
2232 	}
2233 	mutex_exit(&s->mlbs_mtx);
2234 
2235 	return (b);
2236 }
2237 
2238 static mlxcx_buffer_t *
2239 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
2240 {
2241 	ddi_fm_error_t err;
2242 	mlxcx_buffer_t *b;
2243 	uint_t attempts = 0;
2244 
2245 copyb:
2246 	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
2247 		return (NULL);
2248 
2249 	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2250 	bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2251 
2252 	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2253 
2254 	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2255 	    DDI_FME_VERSION);
2256 	if (err.fme_status != DDI_FM_OK) {
2257 		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2258 		    DDI_FME_VERSION);
2259 		mlxcx_buf_return(mlxp, b);
2260 		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2261 			return (NULL);
2262 		}
2263 		goto copyb;
2264 	}
2265 
2266 	return (b);
2267 }
2268 
2269 static mlxcx_buffer_t *
2270 mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2271     mblk_t *mp, size_t off)
2272 {
2273 	mlxcx_buffer_t *b;
2274 	uint8_t *rptr;
2275 	size_t sz;
2276 	boolean_t ret;
2277 
2278 	rptr = mp->b_rptr;
2279 	sz = MBLKL(mp);
2280 
2281 #ifdef DEBUG
2282 	if (off > 0) {
2283 		ASSERT3U(off, <, sz);
2284 	}
2285 #endif
2286 
2287 	rptr += off;
2288 	sz -= off;
2289 
2290 	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
2291 		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2292 	} else {
2293 		b = mlxcx_buf_take_foreign(mlxp, wq);
2294 		if (b == NULL)
2295 			return (NULL);
2296 
2297 		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
2298 		    B_FALSE);
2299 
2300 		if (!ret) {
2301 			mlxcx_buf_return(mlxp, b);
2302 
2303 			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2304 		}
2305 	}
2306 
2307 	return (b);
2308 }
2309 
2310 uint_t
2311 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2312     mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
2313 {
2314 	mlxcx_buffer_t *b, *b0 = NULL;
2315 	boolean_t first = B_TRUE;
2316 	mblk_t *mp;
2317 	size_t offset = off;
2318 	size_t ncookies = 0;
2319 	uint_t count = 0;
2320 
2321 	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
2322 	    mp = mp->b_cont) {
2323 		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
2324 		if (b == NULL)
2325 			goto failed;
2326 
2327 		ncookies += b->mlb_dma.mxdb_ncookies;
2328 
2329 		if (first)
2330 			b0 = b;
2331 
2332 		if (!first)
2333 			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2334 
2335 		b->mlb_tx_mp = mp;
2336 		b->mlb_tx_head = b0;
2337 		b->mlb_used = MBLKL(mp) - offset;
2338 
2339 		if (!first)
2340 			list_insert_tail(&b0->mlb_tx_chain, b);
2341 		first = B_FALSE;
2342 		offset = 0;
2343 
2344 		count++;
2345 	}
2346 
2347 	/*
2348 	 * The chain of mblks has resulted in too many cookies for
2349 	 * a single message. This is unusual, so take the hit to tidy
2350 	 * up, do a pullup to a single mblk and allocate the requisite
2351 	 * buf.
2352 	 */
2353 	if (ncookies > MLXCX_SQE_MAX_PTRS) {
2354 		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
2355 		    mblk_t *, mpb, size_t, ncookies);
2356 
2357 		if (b0 != NULL)
2358 			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2359 
2360 		if ((mp = msgpullup(mpb, -1)) == NULL)
2361 			return (0);
2362 
2363 		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
2364 		if (b0 == NULL) {
2365 			freemsg(mp);
2366 			return (0);
2367 		}
2368 		freemsg(mpb);
2369 
2370 		b0->mlb_tx_mp = mp;
2371 		b0->mlb_tx_head = b0;
2372 		b0->mlb_used = MBLKL(mp) - off;
2373 
2374 		count = 1;
2375 	}
2376 
2377 	*bp = b0;
2378 
2379 	return (count);
2380 
2381 failed:
2382 	if (b0 != NULL)
2383 		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2384 
2385 	return (0);
2386 }
2387 
2388 mlxcx_buffer_t *
2389 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2390 {
2391 	mlxcx_buffer_t *b;
2392 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2393 
2394 	mutex_enter(&s->mlbs_mtx);
2395 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2396 		mutex_exit(&s->mlbs_mtx);
2397 		return (NULL);
2398 	}
2399 
2400 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2401 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2402 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2403 		list_insert_tail(&s->mlbs_busy, b);
2404 	}
2405 	mutex_exit(&s->mlbs_mtx);
2406 
2407 	return (b);
2408 }
2409 
2410 size_t
2411 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
2412     size_t nbufs)
2413 {
2414 	mlxcx_buffer_t *b;
2415 	size_t done = 0;
2416 	mlxcx_buf_shard_t *s;
2417 
2418 	s = wq->mlwq_bufs;
2419 
2420 	mutex_enter(&s->mlbs_mtx);
2421 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2422 		mutex_exit(&s->mlbs_mtx);
2423 		return (0);
2424 	}
2425 
2426 	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
2427 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2428 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2429 		list_insert_tail(&s->mlbs_busy, b);
2430 		bp[done++] = b;
2431 	}
2432 	mutex_exit(&s->mlbs_mtx);
2433 	return (done);
2434 }
2435 
2436 boolean_t
2437 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2438 {
2439 	mlxcx_buf_shard_t *s = b->mlb_shard;
2440 
2441 	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2442 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2443 
2444 	if (b->mlb_mp == NULL) {
2445 		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2446 		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2447 		if (b->mlb_mp == NULL)
2448 			return (B_FALSE);
2449 	}
2450 
2451 	mutex_enter(&s->mlbs_mtx);
2452 
2453 	/* Check if we have too many buffers on loan. */
2454 	if (s->mlbs_nloaned >= s->mlbs_hiwat1 &&
2455 	    b->mlb_used < mlxp->mlx_props.mldp_rx_p50_loan_min_size) {
2456 		mutex_exit(&s->mlbs_mtx);
2457 		return (B_FALSE);
2458 	} else if (s->mlbs_nloaned >= s->mlbs_hiwat2) {
2459 		mutex_exit(&s->mlbs_mtx);
2460 		return (B_FALSE);
2461 	}
2462 
2463 	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2464 	b->mlb_wqe_index = 0;
2465 	list_remove(&s->mlbs_busy, b);
2466 	list_insert_tail(&s->mlbs_loaned, b);
2467 	s->mlbs_nloaned++;
2468 	mutex_exit(&s->mlbs_mtx);
2469 
2470 	return (B_TRUE);
2471 }
2472 
2473 void
2474 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2475 {
2476 	mlxcx_buffer_t *b;
2477 
2478 	if (b0->mlb_tx_head != b0) {
2479 		mlxcx_buf_return(mlxp, b0);
2480 		return;
2481 	}
2482 
2483 	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2484 		mlxcx_buf_return(mlxp, b);
2485 	}
2486 	if (keepmp) {
2487 		b0->mlb_tx_mp = NULL;
2488 		b0->mlb_tx_head = NULL;
2489 	}
2490 	mlxcx_buf_return(mlxp, b0);
2491 }
2492 
2493 inline void
2494 mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr)
2495 {
2496 	s->mlbs_ntotal += incr;
2497 	s->mlbs_hiwat1 = s->mlbs_ntotal / 2;
2498 	s->mlbs_hiwat2 = 3 * (s->mlbs_ntotal / 4);
2499 }
2500 
2501 void
2502 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2503 {
2504 	mlxcx_buffer_state_t oldstate = b->mlb_state;
2505 	mlxcx_buffer_t *txhead = b->mlb_tx_head;
2506 	mlxcx_buf_shard_t *s = b->mlb_shard;
2507 	mblk_t *mp = b->mlb_tx_mp;
2508 
2509 	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2510 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2511 
2512 	/*
2513 	 * The mlbs_mtx held below is a heavily contended lock, so it is
2514 	 * imperative we do as much of the buffer clean up outside the lock
2515 	 * as is possible.
2516 	 */
2517 	b->mlb_state = MLXCX_BUFFER_FREE;
2518 	b->mlb_wqe_index = 0;
2519 	b->mlb_tx_head = NULL;
2520 	b->mlb_tx_mp = NULL;
2521 	b->mlb_used = 0;
2522 	b->mlb_wqebbs = 0;
2523 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2524 
2525 	if (b->mlb_foreign) {
2526 		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2527 			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2528 		}
2529 	}
2530 
2531 	mutex_enter(&s->mlbs_mtx);
2532 	switch (oldstate) {
2533 	case MLXCX_BUFFER_INIT:
2534 		mlxcx_bufshard_adjust_total(s, 1);
2535 		break;
2536 	case MLXCX_BUFFER_ON_WQ:
2537 		list_remove(&s->mlbs_busy, b);
2538 		break;
2539 	case MLXCX_BUFFER_ON_LOAN:
2540 		ASSERT(!b->mlb_foreign);
2541 		--s->mlbs_nloaned;
2542 		list_remove(&s->mlbs_loaned, b);
2543 		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
2544 			/*
2545 			 * When we're draining, Eg during mac_stop(),
2546 			 * we destroy the buffer immediately rather than
2547 			 * recycling it. Otherwise we risk leaving it
2548 			 * on the free list and leaking it.
2549 			 */
2550 			list_insert_tail(&s->mlbs_free, b);
2551 			mlxcx_buf_destroy(mlxp, b);
2552 			/*
2553 			 * Teardown might be waiting for loaned list to empty.
2554 			 */
2555 			cv_broadcast(&s->mlbs_free_nonempty);
2556 			mutex_exit(&s->mlbs_mtx);
2557 			return;
2558 		}
2559 		break;
2560 	case MLXCX_BUFFER_FREE:
2561 		VERIFY(0);
2562 		break;
2563 	case MLXCX_BUFFER_ON_CHAIN:
2564 		ASSERT(txhead != NULL);
2565 		list_remove(&txhead->mlb_tx_chain, b);
2566 		list_remove(&s->mlbs_busy, b);
2567 		break;
2568 	}
2569 
2570 	list_insert_tail(&s->mlbs_free, b);
2571 	cv_broadcast(&s->mlbs_free_nonempty);
2572 
2573 	mutex_exit(&s->mlbs_mtx);
2574 
2575 	/*
2576 	 * For TX chain heads, free the mblk_t after we let go of the lock.
2577 	 * This might be a borrowed buf that we in turn loaned to MAC, in which
2578 	 * case calling freemsg() on it will re-enter this very function -- so
2579 	 * we better not be holding the lock!
2580 	 */
2581 	if (txhead == b)
2582 		freemsg(mp);
2583 }
2584 
2585 void
2586 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2587 {
2588 	mlxcx_buf_shard_t *s = b->mlb_shard;
2589 
2590 	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2591 	    b->mlb_state == MLXCX_BUFFER_INIT);
2592 	ASSERT(mutex_owned(&s->mlbs_mtx));
2593 
2594 	if (b->mlb_state == MLXCX_BUFFER_FREE) {
2595 		list_remove(&s->mlbs_free, b);
2596 		mlxcx_bufshard_adjust_total(s, -1);
2597 	}
2598 
2599 	/*
2600 	 * This is going back to the kmem cache, so it needs to be set up in
2601 	 * the same way we expect a new buffer to come out (state INIT, other
2602 	 * fields NULL'd)
2603 	 */
2604 	b->mlb_state = MLXCX_BUFFER_INIT;
2605 	b->mlb_shard = NULL;
2606 	if (b->mlb_mp != NULL) {
2607 		freeb(b->mlb_mp);
2608 		ASSERT(b->mlb_mp == NULL);
2609 	}
2610 	mlxcx_dma_free(&b->mlb_dma);
2611 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2612 
2613 	kmem_cache_free(mlxp->mlx_bufs_cache, b);
2614 }
2615 
2616 void
2617 mlxcx_shard_ready(mlxcx_buf_shard_t *s)
2618 {
2619 	mutex_enter(&s->mlbs_mtx);
2620 	s->mlbs_state = MLXCX_SHARD_READY;
2621 	mutex_exit(&s->mlbs_mtx);
2622 }
2623 
2624 void
2625 mlxcx_shard_draining(mlxcx_buf_shard_t *s)
2626 {
2627 	mutex_enter(&s->mlbs_mtx);
2628 	s->mlbs_state = MLXCX_SHARD_DRAINING;
2629 	cv_broadcast(&s->mlbs_free_nonempty);
2630 	mutex_exit(&s->mlbs_mtx);
2631 }
2632