xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_ring.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 #include <sys/sdt.h>
29 
30 #include <sys/pattr.h>
31 #include <sys/dlpi.h>
32 
33 #include <sys/mac_provider.h>
34 
35 #include <sys/random.h>
36 
37 #include <mlxcx.h>
38 
39 boolean_t
40 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
41 {
42 	ddi_device_acc_attr_t acc;
43 	ddi_dma_attr_t attr;
44 	boolean_t ret;
45 	size_t sz;
46 
47 	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
48 
49 	/* Receive and send queue entries might be different sizes. */
50 	switch (mlwq->mlwq_type) {
51 	case MLXCX_WQ_TYPE_SENDQ:
52 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
53 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
54 		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
55 		break;
56 	case MLXCX_WQ_TYPE_RECVQ:
57 		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
58 		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
59 		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
60 		break;
61 	default:
62 		VERIFY(0);
63 		return (B_FALSE);
64 	}
65 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
66 
67 	mlxcx_dma_acc_attr(mlxp, &acc);
68 	mlxcx_dma_queue_attr(mlxp, &attr);
69 
70 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
71 	    B_TRUE, sz, B_TRUE);
72 	if (!ret) {
73 		mlxcx_warn(mlxp, "failed to allocate WQ memory");
74 		return (B_FALSE);
75 	}
76 
77 	/*
78 	 * Just set the first pointer in the union. Yes, this is a strict
79 	 * aliasing violation. No, I don't care.
80 	 */
81 	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
82 
83 	mlxcx_dma_acc_attr(mlxp, &acc);
84 	mlxcx_dma_qdbell_attr(mlxp, &attr);
85 	sz = sizeof (mlxcx_workq_doorbell_t);
86 	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
87 	    B_TRUE, sz, B_TRUE);
88 	if (!ret) {
89 		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
90 		mlxcx_dma_free(&mlwq->mlwq_dma);
91 		mlwq->mlwq_send_ent = NULL;
92 		return (B_FALSE);
93 	}
94 
95 	mlwq->mlwq_doorbell =
96 	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
97 
98 	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
99 
100 	return (B_TRUE);
101 }
102 
103 void
104 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
105 {
106 	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
107 	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
108 		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
109 
110 	mlxcx_dma_free(&mlwq->mlwq_dma);
111 	mlwq->mlwq_send_ent = NULL;
112 	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
113 	mlwq->mlwq_doorbell = NULL;
114 
115 	mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC;
116 }
117 
118 static boolean_t
119 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
120     uint_t ent_shift)
121 {
122 	ddi_device_acc_attr_t acc;
123 	ddi_dma_attr_t attr;
124 	boolean_t ret;
125 	size_t sz, i;
126 
127 	VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC);
128 
129 	mlcq->mlcq_entshift = ent_shift;
130 	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
131 	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
132 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
133 
134 	mlxcx_dma_acc_attr(mlxp, &acc);
135 	mlxcx_dma_queue_attr(mlxp, &attr);
136 
137 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
138 	    B_TRUE, sz, B_TRUE);
139 	if (!ret) {
140 		mlxcx_warn(mlxp, "failed to allocate CQ memory");
141 		return (B_FALSE);
142 	}
143 
144 	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
145 
146 	for (i = 0; i < mlcq->mlcq_nents; ++i) {
147 		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
148 		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
149 	}
150 
151 	mlxcx_dma_acc_attr(mlxp, &acc);
152 	mlxcx_dma_qdbell_attr(mlxp, &attr);
153 	sz = sizeof (mlxcx_completionq_doorbell_t);
154 	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
155 	    B_TRUE, sz, B_TRUE);
156 	if (!ret) {
157 		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
158 		mlxcx_dma_free(&mlcq->mlcq_dma);
159 		mlcq->mlcq_ent = NULL;
160 		return (B_FALSE);
161 	}
162 
163 	mlcq->mlcq_doorbell =
164 	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
165 
166 	mlcq->mlcq_state |= MLXCX_CQ_ALLOC;
167 
168 	return (B_TRUE);
169 }
170 
171 static void
172 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
173 {
174 	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
175 	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
176 		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177 
178 	mlxcx_dma_free(&mlcq->mlcq_dma);
179 	mlcq->mlcq_ent = NULL;
180 	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
181 	mlcq->mlcq_doorbell = NULL;
182 
183 	mlcq->mlcq_state &= ~MLXCX_CQ_ALLOC;
184 }
185 
186 void
187 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
188 {
189 	mlxcx_completion_queue_t *mlcq;
190 
191 	/*
192 	 * If something is holding the lock on a long operation like a
193 	 * refill, setting this flag asks them to exit early if possible.
194 	 */
195 	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
196 
197 	mutex_enter(&mlwq->mlwq_mtx);
198 
199 	list_remove(&mlxp->mlx_wqs, mlwq);
200 
201 	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
202 	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
203 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
204 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
205 		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
206 			mlxcx_warn(mlxp, "failed to stop "
207 			    "recv queue num %x", mlwq->mlwq_num);
208 		}
209 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
210 		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
211 		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
212 			mlxcx_warn(mlxp, "failed to stop "
213 			    "send queue num %x", mlwq->mlwq_num);
214 		}
215 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
216 		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
217 			mlxcx_warn(mlxp, "failed to destroy "
218 			    "recv queue num %x", mlwq->mlwq_num);
219 		}
220 		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
221 		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
222 			mlxcx_warn(mlxp, "failed to destroy "
223 			    "send queue num %x", mlwq->mlwq_num);
224 		}
225 	}
226 	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
227 		mlxcx_wq_rele_dma(mlxp, mlwq);
228 	}
229 	mlcq = mlwq->mlwq_cq;
230 
231 	/* These will be released by mlxcx_teardown_bufs() */
232 	mlwq->mlwq_bufs = NULL;
233 	mlwq->mlwq_foreign_bufs = NULL;
234 
235 	mutex_exit(&mlwq->mlwq_mtx);
236 
237 	mutex_enter(&mlcq->mlcq_mtx);
238 	mutex_enter(&mlwq->mlwq_mtx);
239 	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
240 	mlcq->mlcq_wq = NULL;
241 	mutex_exit(&mlwq->mlwq_mtx);
242 	mutex_exit(&mlcq->mlcq_mtx);
243 
244 	mutex_destroy(&mlwq->mlwq_mtx);
245 }
246 
247 void
248 mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
249 {
250 	mlxcx_event_queue_t *mleq;
251 	mlxcx_buffer_t *b;
252 
253 	/*
254 	 * If something is holding the lock on a long operation like polling
255 	 * which we're going to abort anyway, this flag asks them to exit
256 	 * early if possible.
257 	 */
258 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);
259 
260 	mutex_enter(&mlcq->mlcq_mtx);
261 
262 	list_remove(&mlxp->mlx_cqs, mlcq);
263 
264 	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
265 	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
266 		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
267 			mlxcx_warn(mlxp, "failed to destroy "
268 			    "completion queue num %u",
269 			    mlcq->mlcq_num);
270 		}
271 	}
272 	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
273 		mlxcx_cq_rele_dma(mlxp, mlcq);
274 	}
275 	/*
276 	 * If we're on an EQ AVL tree, then we need to grab
277 	 * the EQ's mutex to take it off. The ISR always takes
278 	 * EQ mutex before CQ mutex, so we have to let go of
279 	 * the CQ mutex then come back again.
280 	 *
281 	 * The ISR will bail out if tries to touch this CQ now since
282 	 * we added the CQ_DESTROYED flag above.
283 	 */
284 	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
285 		mleq = mlcq->mlcq_eq;
286 	} else {
287 		mleq = NULL;
288 	}
289 
290 	/* Return any outstanding buffers to the free pool. */
291 	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
292 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
293 	}
294 	mutex_enter(&mlcq->mlcq_bufbmtx);
295 	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
296 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
297 	}
298 	mutex_exit(&mlcq->mlcq_bufbmtx);
299 
300 	/*
301 	 * Since the interrupt handlers take the EQ lock before the CQ one,
302 	 * we must do the same here. That means letting go of the lock
303 	 * for a brief window here (we'll double-check the state when we
304 	 * get back in).
305 	 */
306 	mutex_exit(&mlcq->mlcq_mtx);
307 
308 	if (mleq != NULL) {
309 		mutex_enter(&mleq->mleq_mtx);
310 		mutex_enter(&mlcq->mlcq_mtx);
311 		/*
312 		 * Double-check the state, we let go of the
313 		 * mutex briefly.
314 		 */
315 		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
316 			avl_remove(&mleq->mleq_cqs, mlcq);
317 			mlcq->mlcq_state &= ~MLXCX_CQ_EQAVL;
318 		}
319 		mutex_exit(&mlcq->mlcq_mtx);
320 		mutex_exit(&mleq->mleq_mtx);
321 	}
322 
323 	mutex_enter(&mlcq->mlcq_mtx);
324 	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
325 	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
326 	mutex_exit(&mlcq->mlcq_mtx);
327 
328 	mutex_destroy(&mlcq->mlcq_mtx);
329 	mutex_destroy(&mlcq->mlcq_bufbmtx);
330 	list_destroy(&mlcq->mlcq_buffers);
331 	list_destroy(&mlcq->mlcq_buffers_b);
332 	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
333 }
334 
335 static boolean_t
336 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
337     mlxcx_completion_queue_t **cqp, uint_t ent_shift)
338 {
339 	mlxcx_completion_queue_t *cq;
340 
341 	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
342 	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
343 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
344 	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
345 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
346 	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
347 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
348 	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
349 	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
350 
351 	cq->mlcq_mlx = mlxp;
352 	list_insert_tail(&mlxp->mlx_cqs, cq);
353 
354 	mutex_enter(&cq->mlcq_mtx);
355 
356 	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
357 		mutex_exit(&cq->mlcq_mtx);
358 		return (B_FALSE);
359 	}
360 
361 	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
362 	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
363 
364 	cq->mlcq_uar = &mlxp->mlx_uar;
365 	cq->mlcq_eq = eq;
366 
367 	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
368 	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
369 
370 	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
371 		mutex_exit(&cq->mlcq_mtx);
372 		return (B_FALSE);
373 	}
374 
375 	mutex_exit(&cq->mlcq_mtx);
376 
377 	mutex_enter(&eq->mleq_mtx);
378 	mutex_enter(&cq->mlcq_mtx);
379 	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
380 	avl_add(&eq->mleq_cqs, cq);
381 	cq->mlcq_state |= MLXCX_CQ_EQAVL;
382 	mlxcx_arm_cq(mlxp, cq);
383 	mutex_exit(&cq->mlcq_mtx);
384 	mutex_exit(&eq->mleq_mtx);
385 
386 	*cqp = cq;
387 	return (B_TRUE);
388 }
389 
390 static boolean_t
391 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
392     mlxcx_work_queue_t *wq)
393 {
394 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
395 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
396 
397 	list_insert_tail(&mlxp->mlx_wqs, wq);
398 
399 	mutex_enter(&wq->mlwq_mtx);
400 
401 	wq->mlwq_mlx = mlxp;
402 	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
403 	wq->mlwq_cq = cq;
404 	wq->mlwq_pd = &mlxp->mlx_pd;
405 	wq->mlwq_uar = &mlxp->mlx_uar;
406 
407 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
408 
409 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
410 		mutex_exit(&wq->mlwq_mtx);
411 		return (B_FALSE);
412 	}
413 
414 	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
415 		mutex_exit(&wq->mlwq_mtx);
416 		return (B_FALSE);
417 	}
418 
419 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
420 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
421 
422 	mutex_exit(&wq->mlwq_mtx);
423 
424 	mutex_enter(&cq->mlcq_mtx);
425 	mutex_enter(&wq->mlwq_mtx);
426 	ASSERT3P(cq->mlcq_wq, ==, NULL);
427 	cq->mlcq_wq = wq;
428 	mutex_exit(&wq->mlwq_mtx);
429 	mutex_exit(&cq->mlcq_mtx);
430 
431 	return (B_TRUE);
432 }
433 
434 static boolean_t
435 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
436     mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
437 {
438 	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
439 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
440 
441 	list_insert_tail(&mlxp->mlx_wqs, wq);
442 
443 	mutex_enter(&wq->mlwq_mtx);
444 
445 	wq->mlwq_mlx = mlxp;
446 	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
447 	wq->mlwq_cq = cq;
448 	wq->mlwq_pd = &mlxp->mlx_pd;
449 	wq->mlwq_uar = &mlxp->mlx_uar;
450 	wq->mlwq_tis = tis;
451 
452 	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
453 	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
454 
455 	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
456 	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
457 
458 	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
459 		mutex_exit(&wq->mlwq_mtx);
460 		return (B_FALSE);
461 	}
462 
463 	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
464 		mutex_exit(&wq->mlwq_mtx);
465 		return (B_FALSE);
466 	}
467 
468 	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
469 	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
470 
471 	mutex_exit(&wq->mlwq_mtx);
472 
473 	mutex_enter(&cq->mlcq_mtx);
474 	mutex_enter(&wq->mlwq_mtx);
475 	ASSERT3P(cq->mlcq_wq, ==, NULL);
476 	cq->mlcq_wq = wq;
477 	mutex_exit(&wq->mlwq_mtx);
478 	mutex_exit(&cq->mlcq_mtx);
479 
480 	return (B_TRUE);
481 }
482 
483 /*
484  * Before we tear down the queues associated with the rx group,
485  * flag each cq as being torn down and wake up any tasks.
486  */
487 static void
488 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
489 {
490 	mlxcx_work_queue_t *wq;
491 	mlxcx_completion_queue_t *cq;
492 	mlxcx_buf_shard_t *s;
493 	uint_t i;
494 
495 	mutex_enter(&g->mlg_mtx);
496 
497 	for (i = 0; i < g->mlg_nwqs; ++i) {
498 		wq = &g->mlg_wqs[i];
499 		cq = wq->mlwq_cq;
500 		if (cq != NULL) {
501 			s = wq->mlwq_bufs;
502 			mutex_enter(&s->mlbs_mtx);
503 			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
504 			cv_broadcast(&s->mlbs_free_nonempty);
505 			mutex_exit(&s->mlbs_mtx);
506 		}
507 	}
508 
509 	mutex_exit(&g->mlg_mtx);
510 }
511 
512 void
513 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
514 {
515 	mlxcx_work_queue_t *wq;
516 	mlxcx_completion_queue_t *cq;
517 	mlxcx_flow_entry_t *fe;
518 	mlxcx_flow_group_t *fg;
519 	mlxcx_flow_table_t *ft;
520 	uint_t i;
521 
522 	mutex_enter(&g->mlg_port->mlp_mtx);
523 	mutex_enter(&g->mlg_mtx);
524 
525 	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
526 		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
527 
528 		if (g->mlg_rx_vlan_ft != NULL)
529 			mlxcx_remove_all_vlan_entries(mlxp, g);
530 
531 		if (g == &mlxp->mlx_rx_groups[0]) {
532 			ft = g->mlg_port->mlp_rx_flow;
533 			mutex_enter(&ft->mlft_mtx);
534 
535 			fg = g->mlg_port->mlp_bcast;
536 			fe = list_head(&fg->mlfg_entries);
537 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
538 				(void) mlxcx_cmd_delete_flow_table_entry(
539 				    mlxp, fe);
540 			}
541 
542 			fg = g->mlg_port->mlp_promisc;
543 			fe = list_head(&fg->mlfg_entries);
544 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
545 				(void) mlxcx_cmd_delete_flow_table_entry(
546 				    mlxp, fe);
547 			}
548 
549 			mutex_exit(&ft->mlft_mtx);
550 		}
551 
552 		if (g->mlg_rx_vlan_ft != NULL) {
553 			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
554 			ASSERT(list_is_empty(&g->mlg_rx_vlans));
555 			fg = g->mlg_rx_vlan_def_fg;
556 			fe = list_head(&fg->mlfg_entries);
557 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
558 				(void) mlxcx_cmd_delete_flow_table_entry(
559 				    mlxp, fe);
560 			}
561 			fg = g->mlg_rx_vlan_promisc_fg;
562 			fe = list_head(&fg->mlfg_entries);
563 			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
564 				(void) mlxcx_cmd_delete_flow_table_entry(
565 				    mlxp, fe);
566 			}
567 			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
568 			list_destroy(&g->mlg_rx_vlans);
569 
570 			g->mlg_rx_vlan_ft = NULL;
571 		}
572 
573 		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
574 		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
575 		g->mlg_rx_hash_ft = NULL;
576 
577 		avl_destroy(&g->mlg_rx_macs);
578 		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
579 	}
580 
581 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
582 		for (i = 0; i < g->mlg_nwqs; ++i) {
583 			wq = &g->mlg_wqs[i];
584 			mutex_enter(&wq->mlwq_mtx);
585 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
586 			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
587 				mlxcx_warn(mlxp, "failed to stop rq %x",
588 				    wq->mlwq_num);
589 			}
590 			mutex_exit(&wq->mlwq_mtx);
591 		}
592 		taskq_destroy(g->mlg_refill_tq);
593 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
594 	}
595 
596 	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
597 		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
598 			mlxcx_tir_t *tir = &g->mlg_tir[i];
599 			if (tir->mltir_state & MLXCX_TIR_CREATED &&
600 			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
601 				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
602 					mlxcx_warn(mlxp,
603 					    "failed to destroy tir %u "
604 					    "for rx ring", tir->mltir_num);
605 				}
606 			}
607 		}
608 		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
609 	}
610 
611 	if (g->mlg_state & MLXCX_GROUP_RQT) {
612 		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
613 		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
614 			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
615 				mlxcx_warn(mlxp, "failed to destroy rqt %u "
616 				    "for rx ring", g->mlg_rqt->mlrqt_num);
617 			}
618 			kmem_free(g->mlg_rqt->mlrqt_rq,
619 			    g->mlg_rqt->mlrqt_rq_size);
620 			g->mlg_rqt->mlrqt_rq = NULL;
621 			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
622 			g->mlg_rqt = NULL;
623 		}
624 		g->mlg_state &= ~MLXCX_GROUP_RQT;
625 	}
626 
627 	for (i = 0; i < g->mlg_nwqs; ++i) {
628 		wq = &g->mlg_wqs[i];
629 		cq = wq->mlwq_cq;
630 		mlxcx_wq_teardown(mlxp, wq);
631 		if (cq != NULL)
632 			mlxcx_cq_teardown(mlxp, cq);
633 	}
634 	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
635 	g->mlg_wqs = NULL;
636 	g->mlg_state &= ~MLXCX_GROUP_WQS;
637 
638 	mutex_exit(&g->mlg_mtx);
639 	mutex_exit(&g->mlg_port->mlp_mtx);
640 
641 	mutex_destroy(&g->mlg_mtx);
642 
643 	g->mlg_state &= ~MLXCX_GROUP_INIT;
644 	ASSERT3S(g->mlg_state, ==, 0);
645 }
646 
647 void
648 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
649 {
650 	mlxcx_work_queue_t *wq;
651 	mlxcx_completion_queue_t *cq;
652 	uint_t i;
653 
654 	mutex_enter(&g->mlg_mtx);
655 
656 	if (g->mlg_state & MLXCX_GROUP_WQS) {
657 		for (i = 0; i < g->mlg_nwqs; ++i) {
658 			wq = &g->mlg_wqs[i];
659 			mutex_enter(&wq->mlwq_mtx);
660 			cq = wq->mlwq_cq;
661 			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
662 			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
663 				mlxcx_warn(mlxp, "failed to stop sq %x",
664 				    wq->mlwq_num);
665 			}
666 			mutex_exit(&wq->mlwq_mtx);
667 			mlxcx_wq_teardown(mlxp, wq);
668 			if (cq != NULL)
669 				mlxcx_cq_teardown(mlxp, cq);
670 		}
671 		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
672 		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
673 		g->mlg_wqs = NULL;
674 		g->mlg_state &= ~MLXCX_GROUP_WQS;
675 	}
676 
677 	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
678 	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
679 	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
680 		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
681 			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
682 			    g->mlg_tis.mltis_num);
683 		}
684 	}
685 	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
686 
687 	mutex_exit(&g->mlg_mtx);
688 	mutex_destroy(&g->mlg_mtx);
689 	g->mlg_state &= ~MLXCX_GROUP_INIT;
690 	ASSERT3S(g->mlg_state, ==, 0);
691 }
692 
693 void
694 mlxcx_teardown_groups(mlxcx_t *mlxp)
695 {
696 	mlxcx_ring_group_t *g;
697 	uint_t i;
698 
699 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
700 		g = &mlxp->mlx_rx_groups[i];
701 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
702 			continue;
703 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
704 		mlxcx_quiesce_rx_cqs(mlxp, g);
705 	}
706 
707 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
708 		g = &mlxp->mlx_rx_groups[i];
709 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
710 			continue;
711 		mlxcx_teardown_rx_group(mlxp, g);
712 	}
713 
714 	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
715 	mlxp->mlx_rx_groups = NULL;
716 
717 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
718 		g = &mlxp->mlx_tx_groups[i];
719 		if (!(g->mlg_state & MLXCX_GROUP_INIT))
720 			continue;
721 		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
722 		mlxcx_teardown_tx_group(mlxp, g);
723 	}
724 
725 	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
726 	mlxp->mlx_tx_groups = NULL;
727 }
728 
729 boolean_t
730 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
731 {
732 	mlxcx_event_queue_t *eq;
733 	mlxcx_completion_queue_t *cq;
734 	mlxcx_work_queue_t *rq;
735 	mlxcx_flow_table_t *ft;
736 	mlxcx_flow_group_t *fg;
737 	mlxcx_flow_entry_t *fe;
738 	uint_t ent_shift;
739 	uint_t i, j;
740 
741 	ASSERT3S(g->mlg_state, ==, 0);
742 
743 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
744 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
745 	mutex_enter(&g->mlg_mtx);
746 	g->mlg_mlx = mlxp;
747 	g->mlg_type = MLXCX_GROUP_RX;
748 	g->mlg_port = &mlxp->mlx_ports[0];
749 	g->mlg_state |= MLXCX_GROUP_INIT;
750 
751 	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
752 	i = g - &mlxp->mlx_rx_groups[0];
753 	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
754 		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
755 
756 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
757 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
758 	g->mlg_state |= MLXCX_GROUP_WQS;
759 
760 	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
761 	g->mlg_rqt->mlrqt_max = 2;
762 	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
763 		g->mlg_rqt->mlrqt_max <<= 1;
764 	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
765 	    sizeof (mlxcx_work_queue_t *);
766 	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
767 	g->mlg_state |= MLXCX_GROUP_RQT;
768 
769 	for (i = 0; i < g->mlg_nwqs; ++i) {
770 		eq = NULL;
771 		while (eq == NULL) {
772 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
773 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
774 				mlxp->mlx_next_eq = 1;
775 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
776 			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
777 				/* Try the next one */
778 				eq = NULL;
779 			}
780 		}
781 
782 		/*
783 		 * A single completion is indicated for each rq entry as
784 		 * it is used. So, the number of cq entries never needs
785 		 * to be larger than the rq.
786 		 */
787 		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
788 		    mlxp->mlx_props.mldp_rq_size_shift);
789 		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
790 			g->mlg_nwqs = i;
791 			break;
792 		}
793 
794 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
795 
796 		rq = &g->mlg_wqs[i];
797 		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
798 			g->mlg_nwqs = i;
799 			break;
800 		}
801 		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
802 		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
803 		rq->mlwq_group = g;
804 	}
805 	if (g->mlg_nwqs == 0) {
806 		mutex_exit(&g->mlg_mtx);
807 		return (B_FALSE);
808 	}
809 
810 	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
811 		mutex_exit(&g->mlg_mtx);
812 		return (B_FALSE);
813 	}
814 
815 	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
816 		mlxcx_tir_t *tir = &g->mlg_tir[i];
817 		tir->mltir_tdom = &mlxp->mlx_tdom;
818 		switch (i) {
819 		case MLXCX_TIR_ROLE_OTHER:
820 			tir->mltir_type = MLXCX_TIR_DIRECT;
821 			tir->mltir_rq = &g->mlg_wqs[0];
822 			break;
823 		case MLXCX_TIR_ROLE_IPv4:
824 		case MLXCX_TIR_ROLE_IPv6:
825 		case MLXCX_TIR_ROLE_TCPv4:
826 		case MLXCX_TIR_ROLE_TCPv6:
827 		case MLXCX_TIR_ROLE_UDPv4:
828 		case MLXCX_TIR_ROLE_UDPv6:
829 			tir->mltir_type = MLXCX_TIR_INDIRECT;
830 			tir->mltir_rqtable = g->mlg_rqt;
831 			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
832 			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
833 			    sizeof (tir->mltir_toeplitz_key));
834 			break;
835 		}
836 		switch (i) {
837 		case MLXCX_TIR_ROLE_OTHER:
838 			break;
839 		case MLXCX_TIR_ROLE_IPv4:
840 		case MLXCX_TIR_ROLE_TCPv4:
841 		case MLXCX_TIR_ROLE_UDPv4:
842 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
843 			tir->mltir_hash_fields =
844 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
845 			break;
846 		case MLXCX_TIR_ROLE_IPv6:
847 		case MLXCX_TIR_ROLE_TCPv6:
848 		case MLXCX_TIR_ROLE_UDPv6:
849 			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
850 			tir->mltir_hash_fields =
851 			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
852 			break;
853 		}
854 		switch (i) {
855 		case MLXCX_TIR_ROLE_OTHER:
856 		case MLXCX_TIR_ROLE_IPv4:
857 		case MLXCX_TIR_ROLE_IPv6:
858 			break;
859 		case MLXCX_TIR_ROLE_TCPv4:
860 		case MLXCX_TIR_ROLE_TCPv6:
861 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
862 			tir->mltir_hash_fields |=
863 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
864 			break;
865 		case MLXCX_TIR_ROLE_UDPv4:
866 		case MLXCX_TIR_ROLE_UDPv6:
867 			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
868 			tir->mltir_hash_fields |=
869 			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
870 			break;
871 		}
872 
873 		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
874 			mutex_exit(&g->mlg_mtx);
875 			return (B_FALSE);
876 		}
877 
878 		g->mlg_state |= MLXCX_GROUP_TIRTIS;
879 	}
880 
881 	/*
882 	 * Flow table: our RX hashing breakout table for RSS
883 	 */
884 
885 	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
886 	    KM_SLEEP));
887 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
888 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
889 	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
890 	    sizeof (mlxcx_group_mac_t),
891 	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
892 	g->mlg_state |= MLXCX_GROUP_FLOWS;
893 
894 	mutex_enter(&ft->mlft_mtx);
895 
896 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
897 	ft->mlft_level = 2;
898 	ft->mlft_port = g->mlg_port;
899 	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
900 	ft->mlft_nents = (1 << ft->mlft_entshift);
901 	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
902 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
903 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
904 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
905 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
906 
907 	for (j = 0; j < ft->mlft_nents; ++j) {
908 		ft->mlft_ent[j].mlfe_table = ft;
909 		ft->mlft_ent[j].mlfe_index = j;
910 	}
911 
912 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
913 		mutex_exit(&ft->mlft_mtx);
914 		mutex_exit(&g->mlg_mtx);
915 		return (B_FALSE);
916 	}
917 
918 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
919 	list_insert_tail(&ft->mlft_groups, fg);
920 	fg->mlfg_table = ft;
921 	fg->mlfg_size = 1;
922 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
923 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
924 		mutex_exit(&ft->mlft_mtx);
925 		mutex_exit(&g->mlg_mtx);
926 		return (B_FALSE);
927 	}
928 	fe = list_head(&fg->mlfg_entries);
929 	fe->mlfe_ip_version = 6;
930 	fe->mlfe_ip_proto = IPPROTO_UDP;
931 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
932 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
933 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
934 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
935 		mutex_exit(&ft->mlft_mtx);
936 		mutex_exit(&g->mlg_mtx);
937 		return (B_FALSE);
938 	}
939 
940 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
941 	list_insert_tail(&ft->mlft_groups, fg);
942 	fg->mlfg_table = ft;
943 	fg->mlfg_size = 1;
944 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
945 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
946 		mutex_exit(&ft->mlft_mtx);
947 		mutex_exit(&g->mlg_mtx);
948 		return (B_FALSE);
949 	}
950 	fe = list_head(&fg->mlfg_entries);
951 	fe->mlfe_ip_version = 4;
952 	fe->mlfe_ip_proto = IPPROTO_UDP;
953 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
954 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
955 	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
956 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
957 		mutex_exit(&ft->mlft_mtx);
958 		mutex_exit(&g->mlg_mtx);
959 		return (B_FALSE);
960 	}
961 
962 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
963 	list_insert_tail(&ft->mlft_groups, fg);
964 	fg->mlfg_table = ft;
965 	fg->mlfg_size = 1;
966 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
967 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
968 		mutex_exit(&ft->mlft_mtx);
969 		mutex_exit(&g->mlg_mtx);
970 		return (B_FALSE);
971 	}
972 	fe = list_head(&fg->mlfg_entries);
973 	fe->mlfe_ip_version = 6;
974 	fe->mlfe_ip_proto = IPPROTO_TCP;
975 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
976 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
977 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
978 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
979 		mutex_exit(&ft->mlft_mtx);
980 		mutex_exit(&g->mlg_mtx);
981 		return (B_FALSE);
982 	}
983 
984 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
985 	list_insert_tail(&ft->mlft_groups, fg);
986 	fg->mlfg_table = ft;
987 	fg->mlfg_size = 1;
988 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
989 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
990 		mutex_exit(&ft->mlft_mtx);
991 		mutex_exit(&g->mlg_mtx);
992 		return (B_FALSE);
993 	}
994 	fe = list_head(&fg->mlfg_entries);
995 	fe->mlfe_ip_version = 4;
996 	fe->mlfe_ip_proto = IPPROTO_TCP;
997 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
998 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
999 	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
1000 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1001 		mutex_exit(&ft->mlft_mtx);
1002 		mutex_exit(&g->mlg_mtx);
1003 		return (B_FALSE);
1004 	}
1005 
1006 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1007 	list_insert_tail(&ft->mlft_groups, fg);
1008 	fg->mlfg_table = ft;
1009 	fg->mlfg_size = 1;
1010 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1011 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1012 		mutex_exit(&ft->mlft_mtx);
1013 		mutex_exit(&g->mlg_mtx);
1014 		return (B_FALSE);
1015 	}
1016 	fe = list_head(&fg->mlfg_entries);
1017 	fe->mlfe_ip_version = 6;
1018 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1019 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1020 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
1021 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1022 		mutex_exit(&ft->mlft_mtx);
1023 		mutex_exit(&g->mlg_mtx);
1024 		return (B_FALSE);
1025 	}
1026 
1027 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1028 	list_insert_tail(&ft->mlft_groups, fg);
1029 	fg->mlfg_table = ft;
1030 	fg->mlfg_size = 1;
1031 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1032 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1033 		mutex_exit(&ft->mlft_mtx);
1034 		mutex_exit(&g->mlg_mtx);
1035 		return (B_FALSE);
1036 	}
1037 	fe = list_head(&fg->mlfg_entries);
1038 	fe->mlfe_ip_version = 4;
1039 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1040 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1041 	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
1042 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1043 		mutex_exit(&ft->mlft_mtx);
1044 		mutex_exit(&g->mlg_mtx);
1045 		return (B_FALSE);
1046 	}
1047 
1048 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1049 	list_insert_tail(&ft->mlft_groups, fg);
1050 	fg->mlfg_table = ft;
1051 	fg->mlfg_size = 1;
1052 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1053 		mutex_exit(&ft->mlft_mtx);
1054 		mutex_exit(&g->mlg_mtx);
1055 		return (B_FALSE);
1056 	}
1057 	fe = list_head(&fg->mlfg_entries);
1058 	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1059 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1060 	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1061 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1062 		mutex_exit(&ft->mlft_mtx);
1063 		mutex_exit(&g->mlg_mtx);
1064 		return (B_FALSE);
1065 	}
1066 
1067 	mutex_exit(&ft->mlft_mtx);
1068 
1069 	/*
1070 	 * Flow table: the VLAN breakout table for doing VLAN filtering after
1071 	 * we've matched a MAC address.
1072 	 */
1073 
1074 	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1075 	    KM_SLEEP));
1076 	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1077 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1078 	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1079 	    offsetof(mlxcx_group_vlan_t, mlgv_entry));
1080 
1081 	mutex_enter(&ft->mlft_mtx);
1082 
1083 	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1084 	ft->mlft_level = 1;
1085 	ft->mlft_port = g->mlg_port;
1086 	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1087 	ft->mlft_nents = (1 << ft->mlft_entshift);
1088 	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1089 	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1090 	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1091 	    offsetof(mlxcx_flow_group_t, mlfg_entry));
1092 
1093 	for (j = 0; j < ft->mlft_nents; ++j) {
1094 		fe = &ft->mlft_ent[j];
1095 		fe->mlfe_table = ft;
1096 		fe->mlfe_index = j;
1097 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1098 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1099 	}
1100 
1101 	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1102 		mutex_exit(&ft->mlft_mtx);
1103 		mutex_exit(&g->mlg_mtx);
1104 		return (B_FALSE);
1105 	}
1106 
1107 	/* First group is all actual matched VLANs */
1108 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1109 	g->mlg_rx_vlan_fg = fg;
1110 	list_insert_tail(&ft->mlft_groups, fg);
1111 	fg->mlfg_table = ft;
1112 	fg->mlfg_size = ft->mlft_nents - 2;
1113 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1114 	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1115 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1116 		mutex_exit(&ft->mlft_mtx);
1117 		mutex_exit(&g->mlg_mtx);
1118 		return (B_FALSE);
1119 	}
1120 
1121 	/*
1122 	 * Then the "default" entry which we enable when we have no VLAN IDs
1123 	 * added to the group (we start with this enabled).
1124 	 */
1125 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1126 	g->mlg_rx_vlan_def_fg = fg;
1127 	list_insert_tail(&ft->mlft_groups, fg);
1128 	fg->mlfg_table = ft;
1129 	fg->mlfg_size = 1;
1130 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1131 		mutex_exit(&ft->mlft_mtx);
1132 		mutex_exit(&g->mlg_mtx);
1133 		return (B_FALSE);
1134 	}
1135 	fe = list_head(&fg->mlfg_entries);
1136 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1137 		mutex_exit(&ft->mlft_mtx);
1138 		mutex_exit(&g->mlg_mtx);
1139 		return (B_FALSE);
1140 	}
1141 
1142 	/*
1143 	 * Finally, the promisc entry which points at the *hash ft* from the
1144 	 * default group. We only enable this when we have promisc on.
1145 	 */
1146 	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1147 	g->mlg_rx_vlan_promisc_fg = fg;
1148 	list_insert_tail(&ft->mlft_groups, fg);
1149 	fg->mlfg_table = ft;
1150 	fg->mlfg_size = 1;
1151 	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1152 		mutex_exit(&ft->mlft_mtx);
1153 		mutex_exit(&g->mlg_mtx);
1154 		return (B_FALSE);
1155 	}
1156 	fe = list_head(&fg->mlfg_entries);
1157 	fe->mlfe_ndest = 1;
1158 	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1159 
1160 	mutex_exit(&ft->mlft_mtx);
1161 
1162 	mutex_exit(&g->mlg_mtx);
1163 
1164 	return (B_TRUE);
1165 }
1166 
1167 boolean_t
1168 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1169     mlxcx_work_queue_t *rq)
1170 {
1171 	uint_t j;
1172 	mlxcx_buffer_t *b;
1173 	mlxcx_completion_queue_t *cq;
1174 
1175 	mutex_enter(&g->mlg_mtx);
1176 	/*
1177 	 * Sadly, even though MAC has the mgi_start callback, it is not always
1178 	 * called -- in particular when we are being managed under an aggr, the
1179 	 * mgi_start callback will only ever be called on the default group.
1180 	 *
1181 	 * So instead of asserting about the group state here, we have to
1182 	 * check it and call group start if needed.
1183 	 */
1184 	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1185 		mutex_exit(&g->mlg_mtx);
1186 		if (!mlxcx_rx_group_start(mlxp, g))
1187 			return (B_FALSE);
1188 		mutex_enter(&g->mlg_mtx);
1189 	}
1190 	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1191 
1192 	cq = rq->mlwq_cq;
1193 	ASSERT(cq != NULL);
1194 
1195 	mutex_enter(&cq->mlcq_mtx);
1196 	mutex_enter(&rq->mlwq_mtx);
1197 
1198 	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1199 		mutex_exit(&rq->mlwq_mtx);
1200 		mutex_exit(&cq->mlcq_mtx);
1201 		mutex_exit(&g->mlg_mtx);
1202 		return (B_TRUE);
1203 	}
1204 
1205 	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1206 		mutex_exit(&rq->mlwq_mtx);
1207 		mutex_exit(&cq->mlcq_mtx);
1208 		mutex_exit(&g->mlg_mtx);
1209 		return (B_FALSE);
1210 	}
1211 	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1212 
1213 	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1214 	rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1215 
1216 	mlxcx_shard_ready(rq->mlwq_bufs);
1217 
1218 	for (j = 0; j < rq->mlwq_nents; ++j) {
1219 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1220 			break;
1221 		mlxcx_buf_return(mlxp, b);
1222 	}
1223 	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1224 		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1225 			break;
1226 		mlxcx_buf_return(mlxp, b);
1227 	}
1228 
1229 	mlxcx_rq_refill(mlxp, rq);
1230 
1231 	mutex_exit(&rq->mlwq_mtx);
1232 	mutex_exit(&cq->mlcq_mtx);
1233 	mutex_exit(&g->mlg_mtx);
1234 
1235 	return (B_TRUE);
1236 }
1237 
1238 boolean_t
1239 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1240 {
1241 	mlxcx_flow_table_t *ft;
1242 	mlxcx_flow_group_t *fg;
1243 	mlxcx_flow_entry_t *fe;
1244 	char tq_name[TASKQ_NAMELEN];
1245 
1246 	mutex_enter(&g->mlg_mtx);
1247 
1248 	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1249 		mutex_exit(&g->mlg_mtx);
1250 		return (B_TRUE);
1251 	}
1252 
1253 	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1254 
1255 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1256 
1257 	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
1258 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
1259 	    g - &mlxp->mlx_rx_groups[0]);
1260 
1261 	/*
1262 	 * Create one refill taskq per group with one thread per work queue.
1263 	 * The refill task may block waiting for resources, so by effectively
1264 	 * having one thread per work queue we avoid work queues blocking each
1265 	 * other.
1266 	 */
1267 	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
1268 	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
1269 		mlxcx_warn(mlxp, "failed to create rq refill task queue");
1270 		mutex_exit(&g->mlg_mtx);
1271 		return (B_FALSE);
1272 	}
1273 
1274 	if (g == &mlxp->mlx_rx_groups[0]) {
1275 		ft = g->mlg_port->mlp_rx_flow;
1276 		mutex_enter(&ft->mlft_mtx);
1277 
1278 		/*
1279 		 * Broadcast and promisc entries go directly to group 0's
1280 		 * RSS hash fanout flow table. They bypass VLAN filtering.
1281 		 */
1282 		fg = g->mlg_port->mlp_bcast;
1283 		fe = list_head(&fg->mlfg_entries);
1284 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1285 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1286 			mutex_exit(&ft->mlft_mtx);
1287 			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
1288 			taskq_destroy(g->mlg_refill_tq);
1289 			mutex_exit(&g->mlg_mtx);
1290 			return (B_FALSE);
1291 		}
1292 
1293 		fg = g->mlg_port->mlp_promisc;
1294 		fe = list_head(&fg->mlfg_entries);
1295 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1296 		/*
1297 		 * Don't actually set the promisc entry until promisc is
1298 		 * enabled.
1299 		 */
1300 
1301 		mutex_exit(&ft->mlft_mtx);
1302 	}
1303 
1304 	mutex_exit(&g->mlg_mtx);
1305 
1306 	return (B_TRUE);
1307 }
1308 
1309 boolean_t
1310 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1311 {
1312 	mlxcx_event_queue_t *eq;
1313 	mlxcx_completion_queue_t *cq;
1314 	mlxcx_work_queue_t *sq;
1315 	uint_t i;
1316 
1317 	ASSERT3S(g->mlg_state, ==, 0);
1318 
1319 	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1320 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1321 	g->mlg_state |= MLXCX_GROUP_INIT;
1322 	mutex_enter(&g->mlg_mtx);
1323 
1324 	g->mlg_mlx = mlxp;
1325 	g->mlg_type = MLXCX_GROUP_TX;
1326 	g->mlg_port = &mlxp->mlx_ports[0];
1327 
1328 	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1329 	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1330 	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1331 	g->mlg_state |= MLXCX_GROUP_WQS;
1332 
1333 	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1334 
1335 	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1336 		mutex_exit(&g->mlg_mtx);
1337 		return (B_FALSE);
1338 	}
1339 
1340 	g->mlg_state |= MLXCX_GROUP_TIRTIS;
1341 
1342 	for (i = 0; i < g->mlg_nwqs; ++i) {
1343 		eq = NULL;
1344 		while (eq == NULL) {
1345 			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1346 			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1347 				mlxp->mlx_next_eq = 1;
1348 			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1349 			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1350 				/* Try the next one */
1351 				eq = NULL;
1352 			}
1353 		}
1354 
1355 		if (!mlxcx_cq_setup(mlxp, eq, &cq,
1356 		    mlxp->mlx_props.mldp_cq_size_shift))
1357 			return (B_FALSE);
1358 
1359 		cq->mlcq_stats = &g->mlg_port->mlp_stats;
1360 
1361 		sq = &g->mlg_wqs[i];
1362 		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1363 			mutex_exit(&g->mlg_mtx);
1364 			return (B_FALSE);
1365 		}
1366 		sq->mlwq_group = g;
1367 	}
1368 
1369 	mutex_exit(&g->mlg_mtx);
1370 
1371 	return (B_TRUE);
1372 }
1373 
1374 boolean_t
1375 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1376     mlxcx_work_queue_t *sq)
1377 {
1378 	uint_t i;
1379 	mlxcx_buffer_t *b;
1380 	mlxcx_completion_queue_t *cq;
1381 
1382 	mutex_enter(&g->mlg_mtx);
1383 
1384 	cq = sq->mlwq_cq;
1385 	ASSERT(cq != NULL);
1386 
1387 	mutex_enter(&cq->mlcq_mtx);
1388 	mutex_enter(&sq->mlwq_mtx);
1389 	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1390 		mutex_exit(&sq->mlwq_mtx);
1391 		mutex_exit(&cq->mlcq_mtx);
1392 		mutex_exit(&g->mlg_mtx);
1393 		return (B_TRUE);
1394 	}
1395 
1396 	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
1397 	for (i = 0; i < sq->mlwq_nents; ++i) {
1398 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1399 			break;
1400 		mlxcx_buf_return(mlxp, b);
1401 	}
1402 	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1403 		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1404 			break;
1405 		mlxcx_buf_return(mlxp, b);
1406 	}
1407 	for (i = 0; i < sq->mlwq_nents; ++i) {
1408 		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1409 			break;
1410 		mlxcx_buf_return(mlxp, b);
1411 	}
1412 	sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1413 
1414 	mlxcx_shard_ready(sq->mlwq_bufs);
1415 	mlxcx_shard_ready(sq->mlwq_foreign_bufs);
1416 
1417 	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1418 		mutex_exit(&sq->mlwq_mtx);
1419 		mutex_exit(&cq->mlcq_mtx);
1420 		mutex_exit(&g->mlg_mtx);
1421 		return (B_FALSE);
1422 	}
1423 	g->mlg_state |= MLXCX_GROUP_RUNNING;
1424 
1425 	(void) mlxcx_sq_add_nop(mlxp, sq);
1426 
1427 	mutex_exit(&sq->mlwq_mtx);
1428 	mutex_exit(&cq->mlcq_mtx);
1429 	mutex_exit(&g->mlg_mtx);
1430 
1431 	return (B_TRUE);
1432 }
1433 
1434 static boolean_t
1435 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1436 {
1437 	uint_t idx;
1438 	mlxcx_bf_t *bf;
1439 	ddi_fm_error_t err;
1440 	uint_t try = 0;
1441 
1442 	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1443 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1444 
1445 	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1446 
1447 	ASSERT(mlwq->mlwq_cq != NULL);
1448 	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
1449 	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1450 	bf = &mlwq->mlwq_uar->mlu_bf[idx];
1451 
1452 retry:
1453 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1454 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1455 	    DDI_FME_VERSION);
1456 	if (err.fme_status != DDI_FM_OK) {
1457 		if (try++ < mlxcx_doorbell_tries) {
1458 			ddi_fm_dma_err_clear(
1459 			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1460 			    DDI_FME_VERSION);
1461 			goto retry;
1462 		} else {
1463 			goto err;
1464 		}
1465 	}
1466 
1467 	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1468 	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1469 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1470 	    DDI_FME_VERSION);
1471 	if (err.fme_status == DDI_FM_OK)
1472 		return (B_TRUE);
1473 	if (try++ < mlxcx_doorbell_tries) {
1474 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1475 		goto retry;
1476 	}
1477 
1478 err:
1479 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1480 	return (B_FALSE);
1481 }
1482 
1483 boolean_t
1484 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1485 {
1486 	uint_t index, start_pc;
1487 	mlxcx_sendq_ent_t *ent0;
1488 	ddi_fm_error_t err;
1489 
1490 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1491 
1492 	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1493 	ent0 = &mlwq->mlwq_send_ent[index];
1494 	start_pc = mlwq->mlwq_pc;
1495 	++mlwq->mlwq_pc;
1496 	/*
1497 	 * This counter is manipulated in the interrupt handler, which
1498 	 * does not hold the mlwq_mtx, hence the atomic.
1499 	 */
1500 	atomic_inc_64(&mlwq->mlwq_wqebb_used);
1501 
1502 	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1503 	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1504 	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1505 	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1506 
1507 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1508 	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1509 	set_bits8(&ent0->mlsqe_control.mlcs_flags,
1510 	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1511 
1512 	ent0->mlsqe_control.mlcs_ds = 1;
1513 
1514 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1515 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1516 	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1517 	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1518 	    DDI_FME_VERSION);
1519 	if (err.fme_status != DDI_FM_OK) {
1520 		return (B_FALSE);
1521 	}
1522 	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1523 		return (B_FALSE);
1524 	}
1525 	return (B_TRUE);
1526 }
1527 
/*
 * Post a packet, described by the buffer chain headed by b0, on a send
 * queue and ring the doorbell.
 *
 * A SEND work queue entry is built at the current producer index. Up to
 * sizeof (mles_inline_headers) bytes from inlinehdrs are copied inline into
 * the entry, checksum offload flags are set from chkflags, and one data
 * segment is written for each DMA cookie needed to cover mlb_used bytes of
 * b0 and of each buffer on b0->mlb_tx_chain. If the segments do not fit in
 * the first entry they continue into following ring slots.
 *
 * The caller must hold mlwq_mtx, and b0 must be the head of its own chain
 * and in state MLXCX_BUFFER_ON_WQ.
 *
 * Returns B_TRUE once the doorbell has been rung, in which case b0 is on
 * the CQ's mlcq_buffers_b list. Returns B_FALSE if:
 *  - the ring lacks room for the required entries (nothing has been
 *    consumed in this case);
 *  - the DMA sync of the entries reported an FM error; or
 *  - the doorbell could not be rung.
 * In the latter two cases mlwq_pc and mlwq_wqebb_used have already been
 * advanced.
 */
boolean_t
mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
    mlxcx_buffer_t *b0)
{
	uint_t index, first, ents;
	mlxcx_completion_queue_t *cq;
	mlxcx_sendq_ent_t *ent0;
	mlxcx_sendq_extra_ent_t *ent;
	mlxcx_wqe_data_seg_t *seg;
	uint_t ptri, nptr;
	const ddi_dma_cookie_t *c;
	size_t rem;
	uint64_t wqebb_used;
	mlxcx_buffer_t *b;
	ddi_fm_error_t err;
	boolean_t rv;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	ASSERT3P(b0->mlb_tx_head, ==, b0);
	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	cq = mlwq->mlwq_cq;

	/* The first entry goes at the current producer index. */
	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	b0->mlb_wqe_index = mlwq->mlwq_pc;
	ents = 1;

	/* Remember the first slot: it is what the doorbell write refers to. */
	first = index;

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
	set_bits16(&ent0->mlsqe_eth.mles_szflags,
	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
	if (inlinelen > 0) {
		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
		    inlinelen);
	}

	/*
	 * Start the descriptor size (counted in octowords) at the offset of
	 * the data segments; it is bumped once per data segment added below.
	 */
	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
	    MLXCX_WQE_OCTOWORD;

	if (chkflags & HCK_IPV4_HDRCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
	}
	if (chkflags & HCK_FULLCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
	}

	/*
	 * mlwq_wqebb_used is only incremented whilst holding
	 * the mlwq_mtx mutex, but it is decremented (atomically) in
	 * the interrupt context *not* under mlwq_mtx mutex.
	 * So, now take a snapshot of the number of used wqes which will
	 * be a consistent maximum we can use whilst iterating through
	 * the buffers and DMA cookies.
	 */
	wqebb_used = mlwq->mlwq_wqebb_used;

	/*
	 * Walk b0 and then every buffer on its tx chain, emitting one data
	 * segment per DMA cookie until mlb_used bytes of each buffer have
	 * been described.
	 */
	b = b0;
	ptri = 0;
	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
	seg = ent0->mlsqe_data;
	while (b != NULL) {
		rem = b->mlb_used;

		c = NULL;
		while (rem > 0 &&
		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
			if (ptri >= nptr) {
				/*
				 * The current entry is full: continue into
				 * the next ring slot, unless that would
				 * fill the ring. Nothing has been committed
				 * yet, so we can simply bail out.
				 */
				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
					return (B_FALSE);

				index = (mlwq->mlwq_pc + ents) &
				    (mlwq->mlwq_nents - 1);
				ent = &mlwq->mlwq_send_extra_ent[index];
				++ents;

				seg = ent->mlsqe_data;
				ptri = 0;
				nptr = sizeof (ent->mlsqe_data) /
				    sizeof (mlxcx_wqe_data_seg_t);
			}

			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
			/* Only describe as much of the cookie as is in use. */
			if (c->dmac_size > rem) {
				seg->mlds_byte_count = to_be32(rem);
				rem = 0;
			} else {
				seg->mlds_byte_count = to_be32(c->dmac_size);
				rem -= c->dmac_size;
			}
			seg->mlds_address = to_be64(c->dmac_laddress);
			++seg;
			++ptri;
			++ent0->mlsqe_control.mlcs_ds;

			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
			    MLXCX_SQE_MAX_DS);
		}

		/* The chain list hangs off b0 and does not include b0. */
		if (b == b0) {
			b = list_head(&b0->mlb_tx_chain);
		} else {
			b = list_next(&b0->mlb_tx_chain, b);
		}
	}

	/* Commit: from here on the ring slots are consumed. */
	b0->mlb_wqebbs = ents;
	mlwq->mlwq_pc += ents;
	atomic_add_64(&mlwq->mlwq_wqebb_used, ents);

	/* Pad the unused segments of the last entry with the null lkey. */
	for (; ptri < nptr; ++ptri, ++seg) {
		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
		seg->mlds_byte_count = to_be32(0);
		seg->mlds_address = to_be64(0);
	}

	/*
	 * Make sure the workqueue entry is flushed out before updating
	 * the doorbell.
	 * If the ring has wrapped, we need to flush the front and back.
	 */
	if ((first + ents) > mlwq->mlwq_nents) {
		uint_t sync_cnt = mlwq->mlwq_nents - first;

		/* Back of the ring: from the first entry to the end. */
		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
		    DDI_DMA_SYNC_FORDEV));

		/* The remainder starts at the front of the ring. */
		ent0 = &mlwq->mlwq_send_ent[0];
		ents -= sync_cnt;
	}

	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}

	/*
	 * Hold the bufmtx whilst ringing the doorbell, to prevent
	 * the buffer from being moved to another list, so we can
	 * safely remove it should the ring fail.
	 */
	mutex_enter(&cq->mlcq_bufbmtx);

	list_insert_tail(&cq->mlcq_buffers_b, b0);
	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
		atomic_inc_64(&cq->mlcq_bufcnt);
	} else {
		list_remove(&cq->mlcq_buffers_b, b0);
	}

	mutex_exit(&cq->mlcq_bufbmtx);

	return (rv);
}
1703 
1704 boolean_t
1705 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1706     mlxcx_buffer_t *buf)
1707 {
1708 	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1709 }
1710 
1711 boolean_t
1712 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1713     mlxcx_buffer_t **bufs, size_t nbufs)
1714 {
1715 	uint_t index;
1716 	mlxcx_recvq_ent_t *ent;
1717 	mlxcx_completion_queue_t *cq;
1718 	mlxcx_wqe_data_seg_t *seg;
1719 	uint_t bi, ptri;
1720 	const ddi_dma_cookie_t *c;
1721 	mlxcx_buffer_t *buf;
1722 	ddi_fm_error_t err;
1723 
1724 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1725 	cq = mlwq->mlwq_cq;
1726 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1727 
1728 	for (bi = 0; bi < nbufs; ++bi) {
1729 		buf = bufs[bi];
1730 		bufs[bi] = NULL;
1731 		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1732 
1733 		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1734 		ent = &mlwq->mlwq_recv_ent[index];
1735 		buf->mlb_wqe_index = mlwq->mlwq_pc;
1736 		buf->mlb_wqebbs = 1;
1737 
1738 		++mlwq->mlwq_pc;
1739 		atomic_inc_64(&mlwq->mlwq_wqebb_used);
1740 
1741 		mutex_enter(&cq->mlcq_bufbmtx);
1742 		list_insert_tail(&cq->mlcq_buffers, buf);
1743 		atomic_inc_64(&cq->mlcq_bufcnt);
1744 		mutex_exit(&cq->mlcq_bufbmtx);
1745 
1746 		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1747 		ptri = 0;
1748 		c = NULL;
1749 		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1750 			seg = &ent->mlrqe_data[ptri++];
1751 			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1752 			seg->mlds_byte_count = to_be32(c->dmac_size);
1753 			seg->mlds_address = to_be64(c->dmac_laddress);
1754 		}
1755 		/*
1756 		 * Fill any unused scatter pointers with the special null
1757 		 * value.
1758 		 */
1759 		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1760 			seg = &ent->mlrqe_data[ptri];
1761 			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1762 			seg->mlds_byte_count = to_be32(0);
1763 			seg->mlds_address = to_be64(0);
1764 		}
1765 
1766 		/*
1767 		 * Make sure the workqueue entry is flushed out before updating
1768 		 * the doorbell.
1769 		 */
1770 		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1771 		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1772 		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1773 		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1774 		    DDI_FME_VERSION);
1775 		if (err.fme_status != DDI_FM_OK) {
1776 			return (B_FALSE);
1777 		}
1778 	}
1779 
1780 	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1781 	/*
1782 	 * Flush the CQ doorbell as well so that HW knows how many
1783 	 * completions we've consumed.
1784 	 */
1785 	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1786 	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1787 	    DDI_FME_VERSION);
1788 	if (err.fme_status != DDI_FM_OK) {
1789 		return (B_FALSE);
1790 	}
1791 	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1792 	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1793 	    DDI_FME_VERSION);
1794 	if (err.fme_status != DDI_FM_OK) {
1795 		return (B_FALSE);
1796 	}
1797 	return (B_TRUE);
1798 }
1799 
1800 static void
1801 mlxcx_rq_refill_task(void *arg)
1802 {
1803 	mlxcx_work_queue_t *wq = arg;
1804 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1805 	mlxcx_t *mlxp = wq->mlwq_mlx;
1806 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
1807 	boolean_t refill, draining;
1808 
1809 	do {
1810 		/*
1811 		 * Wait here until one of 3 conditions:
1812 		 * 1. The shard is draining, or
1813 		 * 2. There are buffers on the free list, or
1814 		 * 3. The WQ is being shut down.
1815 		 */
1816 		mutex_enter(&s->mlbs_mtx);
1817 		while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
1818 		    list_is_empty(&s->mlbs_free) &&
1819 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
1820 			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1821 		}
1822 
1823 		draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
1824 		mutex_exit(&s->mlbs_mtx);
1825 
1826 		mutex_enter(&cq->mlcq_mtx);
1827 		mutex_enter(&wq->mlwq_mtx);
1828 
1829 		if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
1830 			refill = B_FALSE;
1831 			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1832 		} else {
1833 			mlxcx_rq_refill(mlxp, wq);
1834 
1835 			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
1836 				refill = B_TRUE;
1837 			} else {
1838 				refill = B_FALSE;
1839 				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1840 			}
1841 		}
1842 
1843 		mutex_exit(&wq->mlwq_mtx);
1844 		mutex_exit(&cq->mlcq_mtx);
1845 	} while (refill);
1846 }
1847 
1848 void
1849 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1850 {
1851 	size_t target, current, want, done, n;
1852 	mlxcx_completion_queue_t *cq;
1853 	mlxcx_ring_group_t *g;
1854 	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1855 	uint_t i;
1856 
1857 	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1858 	cq = mlwq->mlwq_cq;
1859 	ASSERT(mutex_owned(&cq->mlcq_mtx));
1860 
1861 	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1862 
1863 	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1864 	cq = mlwq->mlwq_cq;
1865 
1866 	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
1867 		return;
1868 
1869 	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
1870 		return;
1871 
1872 	current = cq->mlcq_bufcnt;
1873 
1874 	if (current >= target - MLXCX_RQ_REFILL_STEP)
1875 		return;
1876 
1877 	want = target - current;
1878 	done = 0;
1879 
1880 	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1881 		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1882 		if (n == 0) {
1883 			/*
1884 			 * We didn't get any buffers from the free queue.
1885 			 * It might not be an issue, schedule a taskq
1886 			 * to wait for free buffers if the completion
1887 			 * queue is low.
1888 			 */
1889 			if (current < MLXCX_RQ_REFILL_STEP &&
1890 			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
1891 				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
1892 				g = mlwq->mlwq_group;
1893 				taskq_dispatch_ent(g->mlg_refill_tq,
1894 				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
1895 				    &mlwq->mlwq_tqe);
1896 			}
1897 
1898 			return;
1899 		}
1900 
1901 		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
1902 			for (i = 0; i < n; ++i)
1903 				mlxcx_buf_return(mlxp, b[i]);
1904 			return;
1905 		}
1906 		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1907 			/*
1908 			 * mlxcx_rq_add_buffers NULLs out the buffers as it
1909 			 * enqueues them, so any that are non-NULL we have to
1910 			 * free now. The others now belong to the WQ, even if
1911 			 * we failed.
1912 			 */
1913 			for (i = 0; i < n; ++i) {
1914 				if (b[i] != NULL) {
1915 					mlxcx_buf_return(mlxp, b[i]);
1916 				}
1917 			}
1918 			return;
1919 		}
1920 		done += n;
1921 	}
1922 }
1923 
1924 static const char *
1925 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1926 {
1927 	switch (sy) {
1928 	case MLXCX_CQ_ERR_LOCAL_LENGTH:
1929 		return ("LOCAL_LENGTH");
1930 	case MLXCX_CQ_ERR_LOCAL_QP_OP:
1931 		return ("LOCAL_QP_OP");
1932 	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1933 		return ("LOCAL_PROTECTION");
1934 	case MLXCX_CQ_ERR_WR_FLUSHED:
1935 		return ("WR_FLUSHED");
1936 	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1937 		return ("MEM_WINDOW_BIND");
1938 	case MLXCX_CQ_ERR_BAD_RESPONSE:
1939 		return ("BAD_RESPONSE");
1940 	case MLXCX_CQ_ERR_LOCAL_ACCESS:
1941 		return ("LOCAL_ACCESS");
1942 	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1943 		return ("XPORT_RETRY_CTR");
1944 	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1945 		return ("RNR_RETRY_CTR");
1946 	case MLXCX_CQ_ERR_ABORTED:
1947 		return ("ABORTED");
1948 	default:
1949 		return ("UNKNOWN");
1950 	}
1951 }
1952 
1953 static void
1954 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1955     mlxcx_completionq_error_ent_t *ent)
1956 {
1957 	uint64_t ena;
1958 	char buf[FM_MAX_CLASS];
1959 	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1960 
1961 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1962 		return;
1963 
1964 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1965 	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1966 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1967 
1968 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1969 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1970 	    "syndrome", DATA_TYPE_STRING, name,
1971 	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1972 	    "vendor_syndrome", DATA_TYPE_UINT8,
1973 	    ent->mlcqee_vendor_error_syndrome,
1974 	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1975 	    "wq_type", DATA_TYPE_STRING,
1976 	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
1977 	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1978 	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1979 	    NULL);
1980 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1981 }
1982 
1983 void
1984 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1985     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1986 {
1987 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1988 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
1989 		mlxcx_completionq_error_ent_t *eent =
1990 		    (mlxcx_completionq_error_ent_t *)ent;
1991 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
1992 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
1993 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
1994 		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
1995 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
1996 		return;
1997 	}
1998 
1999 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
2000 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2001 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2002 		return;
2003 	}
2004 
2005 	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
2006 		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
2007 		    ent->mlcqe_send_wqe_opcode);
2008 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2009 		return;
2010 	}
2011 
2012 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2013 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2014 		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2015 		return;
2016 	}
2017 
2018 	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2019 }
2020 
2021 mblk_t *
2022 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
2023     mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
2024 {
2025 	uint32_t chkflags = 0;
2026 	uint_t wqe_index;
2027 	ddi_fm_error_t err;
2028 
2029 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
2030 
2031 	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
2032 		mlxcx_completionq_error_ent_t *eent =
2033 		    (mlxcx_completionq_error_ent_t *)ent;
2034 		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2035 		mlxcx_buf_return(mlxp, buf);
2036 		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2037 		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
2038 		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2039 		return (NULL);
2040 	}
2041 
2042 	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
2043 		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2044 		mlxcx_buf_return(mlxp, buf);
2045 		return (NULL);
2046 	}
2047 
2048 	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2049 		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2050 		mlxcx_buf_return(mlxp, buf);
2051 		return (NULL);
2052 	}
2053 
2054 	if (ent->mlcqe_rx_drop_counter > 0) {
2055 		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
2056 		    ent->mlcqe_rx_drop_counter);
2057 	}
2058 
2059 	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
2060 	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
2061 	    DDI_FME_VERSION);
2062 	if (err.fme_status != DDI_FM_OK) {
2063 		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
2064 		    DDI_FME_VERSION);
2065 		mlxcx_buf_return(mlxp, buf);
2066 		return (NULL);
2067 	}
2068 
2069 	/*
2070 	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
2071 	 * Remember it for later.
2072 	 */
2073 	wqe_index = buf->mlb_wqe_index;
2074 
2075 	if (!mlxcx_buf_loan(mlxp, buf)) {
2076 		mlxcx_buf_return(mlxp, buf);
2077 		return (NULL);
2078 	}
2079 
2080 	buf->mlb_mp->b_next = NULL;
2081 	buf->mlb_mp->b_cont = NULL;
2082 	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
2083 	    from_be32(ent->mlcqe_byte_cnt);
2084 
2085 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
2086 		chkflags |= HCK_FULLCKSUM_OK;
2087 	}
2088 	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
2089 		chkflags |= HCK_IPV4_HDRCKSUM_OK;
2090 	}
2091 	if (chkflags != 0) {
2092 		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
2093 		    from_be16(ent->mlcqe_checksum), chkflags);
2094 	}
2095 
2096 	/*
2097 	 * Don't check if a refill is needed on every single completion,
2098 	 * since checking involves taking the RQ lock.
2099 	 */
2100 	if ((wqe_index & 0x7) == 0) {
2101 		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
2102 		ASSERT(wq != NULL);
2103 		mutex_enter(&wq->mlwq_mtx);
2104 		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
2105 			mlxcx_rq_refill(mlxp, wq);
2106 		mutex_exit(&wq->mlwq_mtx);
2107 	}
2108 
2109 	return (buf->mlb_mp);
2110 }
2111 
2112 static void
2113 mlxcx_buf_mp_return(caddr_t arg)
2114 {
2115 	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
2116 	mlxcx_t *mlxp = b->mlb_mlx;
2117 
2118 	/* The mblk has been used now, so NULL it out. */
2119 	b->mlb_mp = NULL;
2120 
2121 	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
2122 		mlxcx_buf_return(mlxp, b);
2123 }
2124 
2125 boolean_t
2126 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
2127 {
2128 	mlxcx_buffer_t *b;
2129 	ddi_device_acc_attr_t acc;
2130 	ddi_dma_attr_t attr;
2131 	boolean_t ret;
2132 
2133 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2134 	b->mlb_shard = shard;
2135 	b->mlb_foreign = B_FALSE;
2136 
2137 	mlxcx_dma_acc_attr(mlxp, &acc);
2138 	mlxcx_dma_buf_attr(mlxp, &attr);
2139 
2140 	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
2141 	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
2142 	if (!ret) {
2143 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2144 		return (B_FALSE);
2145 	}
2146 
2147 	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
2148 	b->mlb_frtn.free_arg = (caddr_t)b;
2149 	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2150 	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2151 
2152 	*bp = b;
2153 
2154 	return (B_TRUE);
2155 }
2156 
2157 boolean_t
2158 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
2159     mlxcx_buffer_t **bp)
2160 {
2161 	mlxcx_buffer_t *b;
2162 	ddi_dma_attr_t attr;
2163 	boolean_t ret;
2164 
2165 	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2166 	b->mlb_shard = shard;
2167 	b->mlb_foreign = B_TRUE;
2168 
2169 	mlxcx_dma_buf_attr(mlxp, &attr);
2170 
2171 	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
2172 	if (!ret) {
2173 		kmem_cache_free(mlxp->mlx_bufs_cache, b);
2174 		return (B_FALSE);
2175 	}
2176 
2177 	*bp = b;
2178 
2179 	return (B_TRUE);
2180 }
2181 
2182 static mlxcx_buffer_t *
2183 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2184 {
2185 	mlxcx_buffer_t *b;
2186 	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
2187 
2188 	mutex_enter(&s->mlbs_mtx);
2189 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2190 		mutex_exit(&s->mlbs_mtx);
2191 		return (NULL);
2192 	}
2193 
2194 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2195 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2196 		ASSERT(b->mlb_foreign);
2197 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2198 		list_insert_tail(&s->mlbs_busy, b);
2199 	}
2200 	mutex_exit(&s->mlbs_mtx);
2201 
2202 	return (b);
2203 }
2204 
2205 static mlxcx_buffer_t *
2206 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
2207 {
2208 	ddi_fm_error_t err;
2209 	mlxcx_buffer_t *b;
2210 	uint_t attempts = 0;
2211 
2212 copyb:
2213 	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
2214 		return (NULL);
2215 
2216 	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2217 	bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2218 
2219 	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2220 
2221 	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2222 	    DDI_FME_VERSION);
2223 	if (err.fme_status != DDI_FM_OK) {
2224 		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2225 		    DDI_FME_VERSION);
2226 		mlxcx_buf_return(mlxp, b);
2227 		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2228 			return (NULL);
2229 		}
2230 		goto copyb;
2231 	}
2232 
2233 	return (b);
2234 }
2235 
2236 static mlxcx_buffer_t *
2237 mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2238     mblk_t *mp, size_t off)
2239 {
2240 	mlxcx_buffer_t *b;
2241 	uint8_t *rptr;
2242 	size_t sz;
2243 	boolean_t ret;
2244 
2245 	rptr = mp->b_rptr;
2246 	sz = MBLKL(mp);
2247 
2248 #ifdef DEBUG
2249 	if (off > 0) {
2250 		ASSERT3U(off, <, sz);
2251 	}
2252 #endif
2253 
2254 	rptr += off;
2255 	sz -= off;
2256 
2257 	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
2258 		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2259 	} else {
2260 		b = mlxcx_buf_take_foreign(mlxp, wq);
2261 		if (b == NULL)
2262 			return (NULL);
2263 
2264 		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
2265 		    B_FALSE);
2266 
2267 		if (!ret) {
2268 			mlxcx_buf_return(mlxp, b);
2269 
2270 			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2271 		}
2272 	}
2273 
2274 	return (b);
2275 }
2276 
2277 uint_t
2278 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2279     mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
2280 {
2281 	mlxcx_buffer_t *b, *b0 = NULL;
2282 	boolean_t first = B_TRUE;
2283 	mblk_t *mp;
2284 	size_t offset = off;
2285 	size_t ncookies = 0;
2286 	uint_t count = 0;
2287 
2288 	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
2289 	    mp = mp->b_cont) {
2290 		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
2291 		if (b == NULL)
2292 			goto failed;
2293 
2294 		ncookies += b->mlb_dma.mxdb_ncookies;
2295 
2296 		if (first)
2297 			b0 = b;
2298 
2299 		if (!first)
2300 			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2301 
2302 		b->mlb_tx_mp = mp;
2303 		b->mlb_tx_head = b0;
2304 		b->mlb_used = MBLKL(mp) - offset;
2305 
2306 		if (!first)
2307 			list_insert_tail(&b0->mlb_tx_chain, b);
2308 		first = B_FALSE;
2309 		offset = 0;
2310 
2311 		count++;
2312 	}
2313 
2314 	/*
2315 	 * The chain of mblks has resulted in too many cookies for
2316 	 * a single message. This is unusual, so take the hit to tidy
2317 	 * up, do a pullup to a single mblk and allocate the requisite
2318 	 * buf.
2319 	 */
2320 	if (ncookies > MLXCX_SQE_MAX_PTRS) {
2321 		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
2322 		    mblk_t *, mpb, size_t, ncookies);
2323 
2324 		if (b0 != NULL)
2325 			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2326 
2327 		if ((mp = msgpullup(mpb, -1)) == NULL)
2328 			return (0);
2329 
2330 		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
2331 		if (b0 == NULL) {
2332 			freemsg(mp);
2333 			return (0);
2334 		}
2335 		freemsg(mpb);
2336 
2337 		b0->mlb_tx_mp = mp;
2338 		b0->mlb_tx_head = b0;
2339 		b0->mlb_used = MBLKL(mp) - off;
2340 
2341 		count = 1;
2342 	}
2343 
2344 	*bp = b0;
2345 
2346 	return (count);
2347 
2348 failed:
2349 	if (b0 != NULL)
2350 		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2351 
2352 	return (0);
2353 }
2354 
2355 mlxcx_buffer_t *
2356 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2357 {
2358 	mlxcx_buffer_t *b;
2359 	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2360 
2361 	mutex_enter(&s->mlbs_mtx);
2362 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2363 		mutex_exit(&s->mlbs_mtx);
2364 		return (NULL);
2365 	}
2366 
2367 	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2368 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2369 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2370 		list_insert_tail(&s->mlbs_busy, b);
2371 	}
2372 	mutex_exit(&s->mlbs_mtx);
2373 
2374 	return (b);
2375 }
2376 
2377 size_t
2378 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2379     mlxcx_buffer_t **bp, size_t nbufs)
2380 {
2381 	mlxcx_buffer_t *b;
2382 	size_t done = 0;
2383 	mlxcx_buf_shard_t *s;
2384 
2385 	s = wq->mlwq_bufs;
2386 
2387 	mutex_enter(&s->mlbs_mtx);
2388 	if (s->mlbs_state != MLXCX_SHARD_READY) {
2389 		mutex_exit(&s->mlbs_mtx);
2390 		return (0);
2391 	}
2392 
2393 	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
2394 		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2395 		b->mlb_state = MLXCX_BUFFER_ON_WQ;
2396 		list_insert_tail(&s->mlbs_busy, b);
2397 		bp[done++] = b;
2398 	}
2399 	mutex_exit(&s->mlbs_mtx);
2400 	return (done);
2401 }
2402 
2403 boolean_t
2404 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2405 {
2406 	mlxcx_buf_shard_t *s = b->mlb_shard;
2407 
2408 	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2409 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2410 
2411 	if (b->mlb_mp == NULL) {
2412 		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2413 		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2414 		if (b->mlb_mp == NULL)
2415 			return (B_FALSE);
2416 	}
2417 
2418 	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2419 	b->mlb_wqe_index = 0;
2420 
2421 	mutex_enter(&s->mlbs_mtx);
2422 	list_remove(&s->mlbs_busy, b);
2423 	list_insert_tail(&s->mlbs_loaned, b);
2424 	mutex_exit(&s->mlbs_mtx);
2425 
2426 	return (B_TRUE);
2427 }
2428 
2429 void
2430 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2431 {
2432 	mlxcx_buffer_t *b;
2433 
2434 	if (b0->mlb_tx_head != b0) {
2435 		mlxcx_buf_return(mlxp, b0);
2436 		return;
2437 	}
2438 
2439 	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2440 		mlxcx_buf_return(mlxp, b);
2441 	}
2442 	if (keepmp) {
2443 		b0->mlb_tx_mp = NULL;
2444 		b0->mlb_tx_head = NULL;
2445 	}
2446 	mlxcx_buf_return(mlxp, b0);
2447 }
2448 
2449 void
2450 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2451 {
2452 	mlxcx_buffer_state_t oldstate = b->mlb_state;
2453 	mlxcx_buffer_t *txhead = b->mlb_tx_head;
2454 	mlxcx_buf_shard_t *s = b->mlb_shard;
2455 	mblk_t *mp = b->mlb_tx_mp;
2456 
2457 	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2458 	ASSERT3P(b->mlb_mlx, ==, mlxp);
2459 
2460 	/*
2461 	 * The mlbs_mtx held below is a heavily contended lock, so it is
2462 	 * imperative we do as much of the buffer clean up outside the lock
2463 	 * as is possible.
2464 	 */
2465 	b->mlb_state = MLXCX_BUFFER_FREE;
2466 	b->mlb_wqe_index = 0;
2467 	b->mlb_tx_head = NULL;
2468 	b->mlb_tx_mp = NULL;
2469 	b->mlb_used = 0;
2470 	b->mlb_wqebbs = 0;
2471 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2472 
2473 	if (b->mlb_foreign) {
2474 		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2475 			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2476 		}
2477 	}
2478 
2479 	mutex_enter(&s->mlbs_mtx);
2480 	switch (oldstate) {
2481 	case MLXCX_BUFFER_INIT:
2482 		break;
2483 	case MLXCX_BUFFER_ON_WQ:
2484 		list_remove(&s->mlbs_busy, b);
2485 		break;
2486 	case MLXCX_BUFFER_ON_LOAN:
2487 		ASSERT(!b->mlb_foreign);
2488 		list_remove(&s->mlbs_loaned, b);
2489 		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
2490 			/*
2491 			 * When we're draining, Eg during mac_stop(),
2492 			 * we destroy the buffer immediately rather than
2493 			 * recycling it. Otherwise we risk leaving it
2494 			 * on the free list and leaking it.
2495 			 */
2496 			list_insert_tail(&s->mlbs_free, b);
2497 			mlxcx_buf_destroy(mlxp, b);
2498 			/*
2499 			 * Teardown might be waiting for loaned list to empty.
2500 			 */
2501 			cv_broadcast(&s->mlbs_free_nonempty);
2502 			mutex_exit(&s->mlbs_mtx);
2503 			return;
2504 		}
2505 		break;
2506 	case MLXCX_BUFFER_FREE:
2507 		VERIFY(0);
2508 		break;
2509 	case MLXCX_BUFFER_ON_CHAIN:
2510 		ASSERT(txhead != NULL);
2511 		list_remove(&txhead->mlb_tx_chain, b);
2512 		list_remove(&s->mlbs_busy, b);
2513 		break;
2514 	}
2515 
2516 	list_insert_tail(&s->mlbs_free, b);
2517 	cv_broadcast(&s->mlbs_free_nonempty);
2518 
2519 	mutex_exit(&s->mlbs_mtx);
2520 
2521 	/*
2522 	 * For TX chain heads, free the mblk_t after we let go of the lock.
2523 	 * This might be a borrowed buf that we in turn loaned to MAC, in which
2524 	 * case calling freemsg() on it will re-enter this very function -- so
2525 	 * we better not be holding the lock!
2526 	 */
2527 	if (txhead == b)
2528 		freemsg(mp);
2529 }
2530 
2531 void
2532 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2533 {
2534 	mlxcx_buf_shard_t *s = b->mlb_shard;
2535 
2536 	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2537 	    b->mlb_state == MLXCX_BUFFER_INIT);
2538 	ASSERT(mutex_owned(&s->mlbs_mtx));
2539 
2540 	if (b->mlb_state == MLXCX_BUFFER_FREE)
2541 		list_remove(&s->mlbs_free, b);
2542 
2543 	/*
2544 	 * This is going back to the kmem cache, so it needs to be set up in
2545 	 * the same way we expect a new buffer to come out (state INIT, other
2546 	 * fields NULL'd)
2547 	 */
2548 	b->mlb_state = MLXCX_BUFFER_INIT;
2549 	b->mlb_shard = NULL;
2550 	if (b->mlb_mp != NULL) {
2551 		freeb(b->mlb_mp);
2552 		ASSERT(b->mlb_mp == NULL);
2553 	}
2554 	mlxcx_dma_free(&b->mlb_dma);
2555 	ASSERT(list_is_empty(&b->mlb_tx_chain));
2556 
2557 	kmem_cache_free(mlxp->mlx_bufs_cache, b);
2558 }
2559 
2560 void
2561 mlxcx_shard_ready(mlxcx_buf_shard_t *s)
2562 {
2563 	mutex_enter(&s->mlbs_mtx);
2564 	s->mlbs_state = MLXCX_SHARD_READY;
2565 	mutex_exit(&s->mlbs_mtx);
2566 }
2567 
2568 void
2569 mlxcx_shard_draining(mlxcx_buf_shard_t *s)
2570 {
2571 	mutex_enter(&s->mlbs_mtx);
2572 	s->mlbs_state = MLXCX_SHARD_DRAINING;
2573 	cv_broadcast(&s->mlbs_free_nonempty);
2574 	mutex_exit(&s->mlbs_mtx);
2575 }
2576