1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2023 The University of Queensland
14 * Copyright (c) 2018, Joyent, Inc.
15 * Copyright 2020 RackTop Systems, Inc.
16 */
17
18 /*
19 * Mellanox Connect-X 4/5/6 driver.
20 */
21
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 #include <sys/sdt.h>
29
30 #include <sys/pattr.h>
31 #include <sys/dlpi.h>
32
33 #include <sys/mac_provider.h>
34
35 #include <sys/random.h>
36
37 #include <mlxcx.h>
38
39 boolean_t
40 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
41 {
42 ddi_device_acc_attr_t acc;
43 ddi_dma_attr_t attr;
44 boolean_t ret;
45 size_t sz;
46
47 VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
48
49 /* Receive and send queue entries might be different sizes. */
50 switch (mlwq->mlwq_type) {
51 case MLXCX_WQ_TYPE_SENDQ:
52 mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
53 mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
54 sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
55 break;
56 case MLXCX_WQ_TYPE_RECVQ:
57 mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
58 mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
59 sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
60 break;
61 default:
62 VERIFY(0);
63 return (B_FALSE);
64 }
65 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
66
67 mlxcx_dma_acc_attr(mlxp, &acc);
68 mlxcx_dma_queue_attr(mlxp, &attr);
69
70 ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
71 B_TRUE, sz, B_TRUE);
72 if (!ret) {
73 mlxcx_warn(mlxp, "failed to allocate WQ memory");
74 return (B_FALSE);
75 }
76
77 /*
78 * Just set the first pointer in the union. Yes, this is a strict
79 * aliasing violation. No, I don't care.
80 */
81 mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
82
83 mlxcx_dma_acc_attr(mlxp, &acc);
84 mlxcx_dma_qdbell_attr(mlxp, &attr);
85 sz = sizeof (mlxcx_workq_doorbell_t);
86 ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
87 B_TRUE, sz, B_TRUE);
88 if (!ret) {
89 mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
90 mlxcx_dma_free(&mlwq->mlwq_dma);
91 mlwq->mlwq_send_ent = NULL;
92 return (B_FALSE);
93 }
94
95 mlwq->mlwq_doorbell =
96 (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;
97
98 mlwq->mlwq_state |= MLXCX_WQ_ALLOC;
99
100 return (B_TRUE);
101 }
102
103 void
104 mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
105 {
106 VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
107 if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
108 VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);
109
110 mlxcx_dma_free(&mlwq->mlwq_dma);
111 mlwq->mlwq_send_ent = NULL;
112 mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
113 mlwq->mlwq_doorbell = NULL;
114
115 	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
116 }
117
118 static boolean_t
119 mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
120 uint_t ent_shift)
121 {
122 ddi_device_acc_attr_t acc;
123 ddi_dma_attr_t attr;
124 boolean_t ret;
125 size_t sz, i;
126
127 	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
128
129 mlcq->mlcq_entshift = ent_shift;
130 mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
131 sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
132 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
133
134 mlxcx_dma_acc_attr(mlxp, &acc);
135 mlxcx_dma_queue_attr(mlxp, &attr);
136
137 ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
138 B_TRUE, sz, B_TRUE);
139 if (!ret) {
140 mlxcx_warn(mlxp, "failed to allocate CQ memory");
141 return (B_FALSE);
142 }
143
144 mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;
145
146 for (i = 0; i < mlcq->mlcq_nents; ++i) {
147 mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
148 mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
149 }
150
151 mlxcx_dma_acc_attr(mlxp, &acc);
152 mlxcx_dma_qdbell_attr(mlxp, &attr);
153 sz = sizeof (mlxcx_completionq_doorbell_t);
154 ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
155 B_TRUE, sz, B_TRUE);
156 if (!ret) {
157 mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
158 mlxcx_dma_free(&mlcq->mlcq_dma);
159 mlcq->mlcq_ent = NULL;
160 return (B_FALSE);
161 }
162
163 mlcq->mlcq_doorbell =
164 (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;
165
166 atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);
167
168 return (B_TRUE);
169 }
170
171 static void
172 mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
173 {
174 VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
175 if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
176 VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177
178 mlxcx_dma_free(&mlcq->mlcq_dma);
179 mlcq->mlcq_ent = NULL;
180 mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
181 mlcq->mlcq_doorbell = NULL;
182
183 atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
184 }
185
186 void
187 mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
188 {
189 mlxcx_completion_queue_t *mlcq;
190
191 /*
192 * If something is holding the lock on a long operation like a
193 * refill, setting this flag asks them to exit early if possible.
194 */
195 atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);
196
197 mutex_enter(&mlwq->mlwq_mtx);
198
199 list_remove(&mlxp->mlx_wqs, mlwq);
200
201 if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
202 !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
203 if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
204 mlwq->mlwq_state & MLXCX_WQ_STARTED &&
205 !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
206 mlxcx_warn(mlxp, "failed to stop "
207 "recv queue num %x", mlwq->mlwq_num);
208 }
209 if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
210 mlwq->mlwq_state & MLXCX_WQ_STARTED &&
211 !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
212 mlxcx_warn(mlxp, "failed to stop "
213 "send queue num %x", mlwq->mlwq_num);
214 }
215 if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
216 !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
217 mlxcx_warn(mlxp, "failed to destroy "
218 "recv queue num %x", mlwq->mlwq_num);
219 }
220 if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
221 !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
222 mlxcx_warn(mlxp, "failed to destroy "
223 "send queue num %x", mlwq->mlwq_num);
224 }
225 }
226 if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
227 mlxcx_wq_rele_dma(mlxp, mlwq);
228 }
229 mlcq = mlwq->mlwq_cq;
230
231 /* These will be released by mlxcx_teardown_bufs() */
232 mlwq->mlwq_bufs = NULL;
233 mlwq->mlwq_foreign_bufs = NULL;
234
235 mutex_exit(&mlwq->mlwq_mtx);
236
237 mutex_enter(&mlcq->mlcq_mtx);
238 mutex_enter(&mlwq->mlwq_mtx);
239 ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
240 mlcq->mlcq_wq = NULL;
241 mutex_exit(&mlwq->mlwq_mtx);
242 mutex_exit(&mlcq->mlcq_mtx);
243
244 mutex_destroy(&mlwq->mlwq_mtx);
245 }
246
247 void
248 mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
249 {
250 mlxcx_event_queue_t *mleq;
251 mlxcx_buffer_t *b;
252
253 /*
254 * If something is holding the lock on a long operation like polling
255 * which we're going to abort anyway, this flag asks them to exit
256 * early if possible.
257 */
258 atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);
259
260 mutex_enter(&mlcq->mlcq_mtx);
261
262 list_remove(&mlxp->mlx_cqs, mlcq);
263
264 if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
265 !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
266 if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
267 mlxcx_warn(mlxp, "failed to destroy "
268 "completion queue num %u",
269 mlcq->mlcq_num);
270 }
271 }
272 if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
273 mlxcx_cq_rele_dma(mlxp, mlcq);
274 }
275 /*
276 * If we're on an EQ AVL tree, then we need to grab
277 * the EQ's mutex to take it off. The ISR always takes
278 * EQ mutex before CQ mutex, so we have to let go of
279 * the CQ mutex then come back again.
280 *
281 	 * The ISR will bail out if it tries to touch this CQ now since
282 * we added the CQ_DESTROYED flag above.
283 */
284 if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
285 mleq = mlcq->mlcq_eq;
286 } else {
287 mleq = NULL;
288 }
289
290 /* Return any outstanding buffers to the free pool. */
291 while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
292 mlxcx_buf_return_chain(mlxp, b, B_FALSE);
293 }
294 mutex_enter(&mlcq->mlcq_bufbmtx);
295 while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
296 mlxcx_buf_return_chain(mlxp, b, B_FALSE);
297 }
298 mutex_exit(&mlcq->mlcq_bufbmtx);
299
300 /*
301 * Since the interrupt handlers take the EQ lock before the CQ one,
302 * we must do the same here. That means letting go of the lock
303 * for a brief window here (we'll double-check the state when we
304 * get back in).
305 */
306 mutex_exit(&mlcq->mlcq_mtx);
307
308 if (mleq != NULL) {
309 mutex_enter(&mleq->mleq_mtx);
310 mutex_enter(&mlcq->mlcq_mtx);
311 /*
312 		 * Double-check the state, since we let go of the
313 		 * mutex briefly.
314 */
315 if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
316 avl_remove(&mleq->mleq_cqs, mlcq);
317 atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
318 }
319 mutex_exit(&mlcq->mlcq_mtx);
320 mutex_exit(&mleq->mleq_mtx);
321 }
322
323 mutex_enter(&mlcq->mlcq_mtx);
324 ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
325 MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
326 mutex_exit(&mlcq->mlcq_mtx);
327
328 mutex_destroy(&mlcq->mlcq_mtx);
329 mutex_destroy(&mlcq->mlcq_arm_mtx);
330 mutex_destroy(&mlcq->mlcq_bufbmtx);
331 list_destroy(&mlcq->mlcq_buffers);
332 list_destroy(&mlcq->mlcq_buffers_b);
333 kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
334 }
335
336 static boolean_t
337 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
338 mlxcx_completion_queue_t **cqp, uint_t ent_shift)
339 {
340 mlxcx_completion_queue_t *cq;
341
342 cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
343 mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
344 DDI_INTR_PRI(mlxp->mlx_intr_pri));
345 mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
346 DDI_INTR_PRI(mlxp->mlx_intr_pri));
347 mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
348 DDI_INTR_PRI(mlxp->mlx_intr_pri));
349 list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
350 offsetof(mlxcx_buffer_t, mlb_cq_entry));
351 list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
352 offsetof(mlxcx_buffer_t, mlb_cq_entry));
353
354 cq->mlcq_mlx = mlxp;
355 list_insert_tail(&mlxp->mlx_cqs, cq);
356
357 mutex_enter(&cq->mlcq_mtx);
358
359 if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
360 mutex_exit(&cq->mlcq_mtx);
361 return (B_FALSE);
362 }
363
364 cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
365 cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;
366
367 cq->mlcq_uar = &mlxp->mlx_uar;
368 cq->mlcq_eq = eq;
369
370 cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
371 cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;
372
373 if (!mlxcx_cmd_create_cq(mlxp, cq)) {
374 mutex_exit(&cq->mlcq_mtx);
375 return (B_FALSE);
376 }
377
378 mutex_exit(&cq->mlcq_mtx);
379
380 mutex_enter(&eq->mleq_mtx);
381 mutex_enter(&cq->mlcq_arm_mtx);
382 mutex_enter(&cq->mlcq_mtx);
383 ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
384 avl_add(&eq->mleq_cqs, cq);
385 atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
386 mlxcx_arm_cq(mlxp, cq);
387 mutex_exit(&cq->mlcq_mtx);
388 mutex_exit(&cq->mlcq_arm_mtx);
389 mutex_exit(&eq->mleq_mtx);
390
391 *cqp = cq;
392 return (B_TRUE);
393 }
394
395 static boolean_t
396 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
397 mlxcx_work_queue_t *wq)
398 {
399 mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
400 DDI_INTR_PRI(mlxp->mlx_intr_pri));
401
402 list_insert_tail(&mlxp->mlx_wqs, wq);
403
404 mutex_enter(&wq->mlwq_mtx);
405
406 wq->mlwq_mlx = mlxp;
407 wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
408 wq->mlwq_cq = cq;
409 wq->mlwq_pd = &mlxp->mlx_pd;
410 wq->mlwq_uar = &mlxp->mlx_uar;
411
412 wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
413
414 if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
415 mutex_exit(&wq->mlwq_mtx);
416 return (B_FALSE);
417 }
418
419 if (!mlxcx_cmd_create_rq(mlxp, wq)) {
420 mutex_exit(&wq->mlwq_mtx);
421 return (B_FALSE);
422 }
423
424 wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
425 wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
426
427 mutex_exit(&wq->mlwq_mtx);
428
429 mutex_enter(&cq->mlcq_mtx);
430 mutex_enter(&wq->mlwq_mtx);
431 ASSERT3P(cq->mlcq_wq, ==, NULL);
432 cq->mlcq_wq = wq;
433 mutex_exit(&wq->mlwq_mtx);
434 mutex_exit(&cq->mlcq_mtx);
435
436 return (B_TRUE);
437 }
438
439 static boolean_t
440 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
441 mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
442 {
443 mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
444 DDI_INTR_PRI(mlxp->mlx_intr_pri));
445
446 list_insert_tail(&mlxp->mlx_wqs, wq);
447
448 mutex_enter(&wq->mlwq_mtx);
449
450 wq->mlwq_mlx = mlxp;
451 wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
452 wq->mlwq_cq = cq;
453 wq->mlwq_pd = &mlxp->mlx_pd;
454 wq->mlwq_uar = &mlxp->mlx_uar;
455 wq->mlwq_tis = tis;
456
457 wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
458 wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);
459
460 VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
461 wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;
462
463 if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
464 mutex_exit(&wq->mlwq_mtx);
465 return (B_FALSE);
466 }
467
468 if (!mlxcx_cmd_create_sq(mlxp, wq)) {
469 mutex_exit(&wq->mlwq_mtx);
470 return (B_FALSE);
471 }
472
473 wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
474 wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
475
476 mutex_exit(&wq->mlwq_mtx);
477
478 mutex_enter(&cq->mlcq_mtx);
479 mutex_enter(&wq->mlwq_mtx);
480 ASSERT3P(cq->mlcq_wq, ==, NULL);
481 cq->mlcq_wq = wq;
482 mutex_exit(&wq->mlwq_mtx);
483 mutex_exit(&cq->mlcq_mtx);
484
485 return (B_TRUE);
486 }
487
488 /*
489 * Before we tear down the queues associated with the rx group,
490 * flag each cq as being torn down and wake up any tasks.
491 */
492 static void
493 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
494 {
495 mlxcx_work_queue_t *wq;
496 mlxcx_completion_queue_t *cq;
497 mlxcx_buf_shard_t *s;
498 uint_t i;
499
500 mutex_enter(&g->mlg_mtx);
501
502 for (i = 0; i < g->mlg_nwqs; ++i) {
503 wq = &g->mlg_wqs[i];
504 cq = wq->mlwq_cq;
505 if (cq != NULL) {
506 s = wq->mlwq_bufs;
507 mutex_enter(&s->mlbs_mtx);
508 atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
509 cv_broadcast(&s->mlbs_free_nonempty);
510 mutex_exit(&s->mlbs_mtx);
511 }
512 }
513
514 mutex_exit(&g->mlg_mtx);
515 }
516
517 void
518 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
519 {
520 mlxcx_work_queue_t *wq;
521 mlxcx_completion_queue_t *cq;
522 mlxcx_flow_entry_t *fe;
523 mlxcx_flow_group_t *fg;
524 mlxcx_flow_table_t *ft;
525 uint_t i;
526
527 mutex_enter(&g->mlg_port->mlp_mtx);
528 mutex_enter(&g->mlg_mtx);
529
530 if (g->mlg_state & MLXCX_GROUP_FLOWS) {
531 mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);
532
533 if (g->mlg_rx_vlan_ft != NULL)
534 mlxcx_remove_all_vlan_entries(mlxp, g);
535
536 if (g == &mlxp->mlx_rx_groups[0]) {
537 ft = g->mlg_port->mlp_rx_flow;
538 mutex_enter(&ft->mlft_mtx);
539
540 fg = g->mlg_port->mlp_bcast;
541 fe = list_head(&fg->mlfg_entries);
542 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
543 (void) mlxcx_cmd_delete_flow_table_entry(
544 mlxp, fe);
545 }
546
547 fg = g->mlg_port->mlp_promisc;
548 fe = list_head(&fg->mlfg_entries);
549 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
550 (void) mlxcx_cmd_delete_flow_table_entry(
551 mlxp, fe);
552 }
553
554 mutex_exit(&ft->mlft_mtx);
555 }
556
557 if (g->mlg_rx_vlan_ft != NULL) {
558 mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
559 ASSERT(list_is_empty(&g->mlg_rx_vlans));
560 fg = g->mlg_rx_vlan_def_fg;
561 if (fg != NULL) {
562 fe = list_head(&fg->mlfg_entries);
563 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
564 (void)
565 mlxcx_cmd_delete_flow_table_entry(
566 mlxp, fe);
567 }
568 }
569 fg = g->mlg_rx_vlan_promisc_fg;
570 if (fg != NULL) {
571 fe = list_head(&fg->mlfg_entries);
572 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
573 (void)
574 mlxcx_cmd_delete_flow_table_entry(
575 mlxp, fe);
576 }
577 }
578 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
579 list_destroy(&g->mlg_rx_vlans);
580
581 g->mlg_rx_vlan_ft = NULL;
582 }
583
584 mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
585 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
586 g->mlg_rx_hash_ft = NULL;
587
588 avl_destroy(&g->mlg_rx_macs);
589 g->mlg_state &= ~MLXCX_GROUP_FLOWS;
590 }
591
592 if (g->mlg_state & MLXCX_GROUP_RUNNING) {
593 for (i = 0; i < g->mlg_nwqs; ++i) {
594 wq = &g->mlg_wqs[i];
595 mutex_enter(&wq->mlwq_mtx);
596 if (wq->mlwq_state & MLXCX_WQ_STARTED &&
597 !mlxcx_cmd_stop_rq(mlxp, wq)) {
598 mlxcx_warn(mlxp, "failed to stop rq %x",
599 wq->mlwq_num);
600 }
601 mutex_exit(&wq->mlwq_mtx);
602 }
603 taskq_destroy(g->mlg_refill_tq);
604 g->mlg_state &= ~MLXCX_GROUP_RUNNING;
605 }
606
607 if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
608 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
609 mlxcx_tir_t *tir = &g->mlg_tir[i];
610 if (tir->mltir_state & MLXCX_TIR_CREATED &&
611 !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
612 if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
613 mlxcx_warn(mlxp,
614 "failed to destroy tir %u "
615 "for rx ring", tir->mltir_num);
616 }
617 }
618 }
619 g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
620 }
621
622 if (g->mlg_state & MLXCX_GROUP_RQT) {
623 if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
624 !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
625 if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
626 mlxcx_warn(mlxp, "failed to destroy rqt %u "
627 "for rx ring", g->mlg_rqt->mlrqt_num);
628 }
629 kmem_free(g->mlg_rqt->mlrqt_rq,
630 g->mlg_rqt->mlrqt_rq_size);
631 g->mlg_rqt->mlrqt_rq = NULL;
632 kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
633 g->mlg_rqt = NULL;
634 }
635 g->mlg_state &= ~MLXCX_GROUP_RQT;
636 }
637
638 for (i = 0; i < g->mlg_nwqs; ++i) {
639 wq = &g->mlg_wqs[i];
640 cq = wq->mlwq_cq;
641 mlxcx_wq_teardown(mlxp, wq);
642 if (cq != NULL)
643 mlxcx_cq_teardown(mlxp, cq);
644 }
645 kmem_free(g->mlg_wqs, g->mlg_wqs_size);
646 g->mlg_wqs = NULL;
647 g->mlg_state &= ~MLXCX_GROUP_WQS;
648
649 mutex_exit(&g->mlg_mtx);
650 mutex_exit(&g->mlg_port->mlp_mtx);
651
652 mutex_destroy(&g->mlg_mtx);
653
654 g->mlg_state &= ~MLXCX_GROUP_INIT;
655 ASSERT3S(g->mlg_state, ==, 0);
656 }
657
658 void
659 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
660 {
661 mlxcx_work_queue_t *wq;
662 mlxcx_completion_queue_t *cq;
663 uint_t i;
664
665 mutex_enter(&g->mlg_mtx);
666
667 if (g->mlg_state & MLXCX_GROUP_WQS) {
668 for (i = 0; i < g->mlg_nwqs; ++i) {
669 wq = &g->mlg_wqs[i];
670 mutex_enter(&wq->mlwq_mtx);
671 cq = wq->mlwq_cq;
672 if (wq->mlwq_state & MLXCX_WQ_STARTED &&
673 !mlxcx_cmd_stop_sq(mlxp, wq)) {
674 mlxcx_warn(mlxp, "failed to stop sq %x",
675 wq->mlwq_num);
676 }
677 mutex_exit(&wq->mlwq_mtx);
678 mlxcx_wq_teardown(mlxp, wq);
679 if (cq != NULL)
680 mlxcx_cq_teardown(mlxp, cq);
681 }
682 g->mlg_state &= ~MLXCX_GROUP_RUNNING;
683 kmem_free(g->mlg_wqs, g->mlg_wqs_size);
684 g->mlg_wqs = NULL;
685 g->mlg_state &= ~MLXCX_GROUP_WQS;
686 }
687
688 if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
689 g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
690 !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
691 if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
692 mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
693 g->mlg_tis.mltis_num);
694 }
695 }
696 g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
697
698 mutex_exit(&g->mlg_mtx);
699 mutex_destroy(&g->mlg_mtx);
700 g->mlg_state &= ~MLXCX_GROUP_INIT;
701 ASSERT3S(g->mlg_state, ==, 0);
702 }
703
704 void
705 mlxcx_teardown_groups(mlxcx_t *mlxp)
706 {
707 mlxcx_ring_group_t *g;
708 uint_t i;
709
710 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
711 g = &mlxp->mlx_rx_groups[i];
712 if (!(g->mlg_state & MLXCX_GROUP_INIT))
713 continue;
714 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
715 mlxcx_quiesce_rx_cqs(mlxp, g);
716 }
717
718 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
719 g = &mlxp->mlx_rx_groups[i];
720 if (!(g->mlg_state & MLXCX_GROUP_INIT))
721 continue;
722 mlxcx_teardown_rx_group(mlxp, g);
723 }
724
725 kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
726 mlxp->mlx_rx_groups = NULL;
727
728 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
729 g = &mlxp->mlx_tx_groups[i];
730 if (!(g->mlg_state & MLXCX_GROUP_INIT))
731 continue;
732 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
733 mlxcx_teardown_tx_group(mlxp, g);
734 }
735
736 kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
737 mlxp->mlx_tx_groups = NULL;
738 }
739
740 boolean_t
741 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
742 {
743 mlxcx_event_queue_t *eq;
744 mlxcx_completion_queue_t *cq;
745 mlxcx_work_queue_t *rq;
746 mlxcx_flow_table_t *ft;
747 mlxcx_flow_group_t *fg;
748 mlxcx_flow_entry_t *fe;
749 uint_t ent_shift;
750 uint_t i, j;
751
752 ASSERT3S(g->mlg_state, ==, 0);
753
754 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
755 DDI_INTR_PRI(mlxp->mlx_intr_pri));
756 mutex_enter(&g->mlg_mtx);
757 g->mlg_mlx = mlxp;
758 g->mlg_type = MLXCX_GROUP_RX;
759 g->mlg_port = &mlxp->mlx_ports[0];
760 g->mlg_state |= MLXCX_GROUP_INIT;
761
762 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
763 i = g - &mlxp->mlx_rx_groups[0];
764 if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
765 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;
766
767 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
768 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
769 g->mlg_state |= MLXCX_GROUP_WQS;
770
771 g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
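	/*
	 * The RQ table is sized to the next power of two at or above the
	 * number of WQs in this group, with a minimum of 2.
	 */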
772 g->mlg_rqt->mlrqt_max = 2;
773 while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
774 g->mlg_rqt->mlrqt_max <<= 1;
775 g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
776 sizeof (mlxcx_work_queue_t *);
777 g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
778 g->mlg_state |= MLXCX_GROUP_RQT;
779
780 for (i = 0; i < g->mlg_nwqs; ++i) {
781 eq = NULL;
782 while (eq == NULL) {
783 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
784 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
785 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
786 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
787 eq->mleq_type != MLXCX_EQ_TYPE_RX) {
788 /* Try the next one */
789 eq = NULL;
790 }
791 }
792
793 /*
794 * A single completion is indicated for each rq entry as
795 * it is used. So, the number of cq entries never needs
796 * to be larger than the rq.
797 */
798 ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
799 mlxp->mlx_props.mldp_rq_size_shift);
800 if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
801 g->mlg_nwqs = i;
802 break;
803 }
804
805 cq->mlcq_stats = &g->mlg_port->mlp_stats;
806
807 rq = &g->mlg_wqs[i];
808 if (!mlxcx_rq_setup(mlxp, cq, rq)) {
809 g->mlg_nwqs = i;
810 break;
811 }
812 g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
813 g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
814 rq->mlwq_group = g;
815 }
816 if (g->mlg_nwqs == 0) {
817 mutex_exit(&g->mlg_mtx);
818 return (B_FALSE);
819 }
820
821 if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
822 mutex_exit(&g->mlg_mtx);
823 return (B_FALSE);
824 }
825
826 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
827 mlxcx_tir_t *tir = &g->mlg_tir[i];
828 tir->mltir_tdom = &mlxp->mlx_tdom;
829 switch (i) {
830 case MLXCX_TIR_ROLE_OTHER:
831 tir->mltir_type = MLXCX_TIR_DIRECT;
832 tir->mltir_rq = &g->mlg_wqs[0];
833 break;
834 case MLXCX_TIR_ROLE_IPv4:
835 case MLXCX_TIR_ROLE_IPv6:
836 case MLXCX_TIR_ROLE_TCPv4:
837 case MLXCX_TIR_ROLE_TCPv6:
838 case MLXCX_TIR_ROLE_UDPv4:
839 case MLXCX_TIR_ROLE_UDPv6:
840 tir->mltir_type = MLXCX_TIR_INDIRECT;
841 tir->mltir_rqtable = g->mlg_rqt;
842 tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
843 (void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
844 sizeof (tir->mltir_toeplitz_key));
845 break;
846 }
847 switch (i) {
848 case MLXCX_TIR_ROLE_OTHER:
849 break;
850 case MLXCX_TIR_ROLE_IPv4:
851 case MLXCX_TIR_ROLE_TCPv4:
852 case MLXCX_TIR_ROLE_UDPv4:
853 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
854 tir->mltir_hash_fields =
855 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
856 break;
857 case MLXCX_TIR_ROLE_IPv6:
858 case MLXCX_TIR_ROLE_TCPv6:
859 case MLXCX_TIR_ROLE_UDPv6:
860 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
861 tir->mltir_hash_fields =
862 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
863 break;
864 }
865 switch (i) {
866 case MLXCX_TIR_ROLE_OTHER:
867 case MLXCX_TIR_ROLE_IPv4:
868 case MLXCX_TIR_ROLE_IPv6:
869 break;
870 case MLXCX_TIR_ROLE_TCPv4:
871 case MLXCX_TIR_ROLE_TCPv6:
872 tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
873 tir->mltir_hash_fields |=
874 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
875 break;
876 case MLXCX_TIR_ROLE_UDPv4:
877 case MLXCX_TIR_ROLE_UDPv6:
878 tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
879 tir->mltir_hash_fields |=
880 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
881 break;
882 }
883
884 if (!mlxcx_cmd_create_tir(mlxp, tir)) {
885 mutex_exit(&g->mlg_mtx);
886 return (B_FALSE);
887 }
888
889 g->mlg_state |= MLXCX_GROUP_TIRTIS;
890 }
891
892 /*
893 * Flow table: our RX hashing breakout table for RSS
894 */
895
896 g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
897 KM_SLEEP));
898 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
899 DDI_INTR_PRI(mlxp->mlx_intr_pri));
900 avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
901 sizeof (mlxcx_group_mac_t),
902 offsetof(mlxcx_group_mac_t, mlgm_group_entry));
903 g->mlg_state |= MLXCX_GROUP_FLOWS;
904
905 mutex_enter(&ft->mlft_mtx);
906
907 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
908 ft->mlft_level = 2;
909 ft->mlft_port = g->mlg_port;
910 ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
911 ft->mlft_nents = (1 << ft->mlft_entshift);
912 ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
913 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
914 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
915 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
916 offsetof(mlxcx_flow_group_t, mlfg_entry));
917
918 for (j = 0; j < ft->mlft_nents; ++j) {
919 ft->mlft_ent[j].mlfe_table = ft;
920 ft->mlft_ent[j].mlfe_index = j;
921 }
922
923 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
924 mutex_exit(&ft->mlft_mtx);
925 mutex_exit(&g->mlg_mtx);
926 return (B_FALSE);
927 }
928
929 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
930 list_insert_tail(&ft->mlft_groups, fg);
931 fg->mlfg_table = ft;
932 fg->mlfg_size = 1;
933 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
934 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
935 mutex_exit(&ft->mlft_mtx);
936 mutex_exit(&g->mlg_mtx);
937 return (B_FALSE);
938 }
939 fe = list_head(&fg->mlfg_entries);
940 fe->mlfe_ip_version = 6;
941 fe->mlfe_ip_proto = IPPROTO_UDP;
942 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
943 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
944 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
945 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
946 mutex_exit(&ft->mlft_mtx);
947 mutex_exit(&g->mlg_mtx);
948 return (B_FALSE);
949 }
950
951 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
952 list_insert_tail(&ft->mlft_groups, fg);
953 fg->mlfg_table = ft;
954 fg->mlfg_size = 1;
955 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
956 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
957 mutex_exit(&ft->mlft_mtx);
958 mutex_exit(&g->mlg_mtx);
959 return (B_FALSE);
960 }
961 fe = list_head(&fg->mlfg_entries);
962 fe->mlfe_ip_version = 4;
963 fe->mlfe_ip_proto = IPPROTO_UDP;
964 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
965 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
966 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
967 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
968 mutex_exit(&ft->mlft_mtx);
969 mutex_exit(&g->mlg_mtx);
970 return (B_FALSE);
971 }
972
973 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
974 list_insert_tail(&ft->mlft_groups, fg);
975 fg->mlfg_table = ft;
976 fg->mlfg_size = 1;
977 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
978 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
979 mutex_exit(&ft->mlft_mtx);
980 mutex_exit(&g->mlg_mtx);
981 return (B_FALSE);
982 }
983 fe = list_head(&fg->mlfg_entries);
984 fe->mlfe_ip_version = 6;
985 fe->mlfe_ip_proto = IPPROTO_TCP;
986 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
987 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
988 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
989 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
990 mutex_exit(&ft->mlft_mtx);
991 mutex_exit(&g->mlg_mtx);
992 return (B_FALSE);
993 }
994
995 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
996 list_insert_tail(&ft->mlft_groups, fg);
997 fg->mlfg_table = ft;
998 fg->mlfg_size = 1;
999 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
1000 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1001 mutex_exit(&ft->mlft_mtx);
1002 mutex_exit(&g->mlg_mtx);
1003 return (B_FALSE);
1004 }
1005 fe = list_head(&fg->mlfg_entries);
1006 fe->mlfe_ip_version = 4;
1007 fe->mlfe_ip_proto = IPPROTO_TCP;
1008 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1009 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1010 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
1011 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1012 mutex_exit(&ft->mlft_mtx);
1013 mutex_exit(&g->mlg_mtx);
1014 return (B_FALSE);
1015 }
1016
1017 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1018 list_insert_tail(&ft->mlft_groups, fg);
1019 fg->mlfg_table = ft;
1020 fg->mlfg_size = 1;
1021 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1022 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1023 mutex_exit(&ft->mlft_mtx);
1024 mutex_exit(&g->mlg_mtx);
1025 return (B_FALSE);
1026 }
1027 fe = list_head(&fg->mlfg_entries);
1028 fe->mlfe_ip_version = 6;
1029 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1030 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1031 &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
1032 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1033 mutex_exit(&ft->mlft_mtx);
1034 mutex_exit(&g->mlg_mtx);
1035 return (B_FALSE);
1036 }
1037
1038 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1039 list_insert_tail(&ft->mlft_groups, fg);
1040 fg->mlfg_table = ft;
1041 fg->mlfg_size = 1;
1042 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
1043 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1044 mutex_exit(&ft->mlft_mtx);
1045 mutex_exit(&g->mlg_mtx);
1046 return (B_FALSE);
1047 }
1048 fe = list_head(&fg->mlfg_entries);
1049 fe->mlfe_ip_version = 4;
1050 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1051 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1052 &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
1053 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1054 mutex_exit(&ft->mlft_mtx);
1055 mutex_exit(&g->mlg_mtx);
1056 return (B_FALSE);
1057 }
1058
1059 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1060 list_insert_tail(&ft->mlft_groups, fg);
1061 fg->mlfg_table = ft;
1062 fg->mlfg_size = 1;
1063 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1064 mutex_exit(&ft->mlft_mtx);
1065 mutex_exit(&g->mlg_mtx);
1066 return (B_FALSE);
1067 }
1068 fe = list_head(&fg->mlfg_entries);
1069 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1070 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
1071 &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
1072 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1073 mutex_exit(&ft->mlft_mtx);
1074 mutex_exit(&g->mlg_mtx);
1075 return (B_FALSE);
1076 }
1077
1078 mutex_exit(&ft->mlft_mtx);
1079
1080 /*
1081 * Flow table: the VLAN breakout table for doing VLAN filtering after
1082 * we've matched a MAC address.
1083 */
1084
1085 g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1086 KM_SLEEP));
1087 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1088 DDI_INTR_PRI(mlxp->mlx_intr_pri));
1089 list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
1090 offsetof(mlxcx_group_vlan_t, mlgv_entry));
1091
1092 mutex_enter(&ft->mlft_mtx);
1093
1094 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1095 ft->mlft_level = 1;
1096 ft->mlft_port = g->mlg_port;
1097 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
1098 ft->mlft_nents = (1 << ft->mlft_entshift);
1099 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1100 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1101 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1102 offsetof(mlxcx_flow_group_t, mlfg_entry));
1103
1104 for (j = 0; j < ft->mlft_nents; ++j) {
1105 fe = &ft->mlft_ent[j];
1106 fe->mlfe_table = ft;
1107 fe->mlfe_index = j;
1108 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1109 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1110 }
1111
1112 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1113 mutex_exit(&ft->mlft_mtx);
1114 mutex_exit(&g->mlg_mtx);
1115 return (B_FALSE);
1116 }
1117
1118 /* First group is all actual matched VLANs */
1119 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1120 g->mlg_rx_vlan_fg = fg;
1121 list_insert_tail(&ft->mlft_groups, fg);
1122 fg->mlfg_table = ft;
1123 fg->mlfg_size = ft->mlft_nents - 2;
1124 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
1125 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
1126 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1127 mutex_exit(&ft->mlft_mtx);
1128 mutex_exit(&g->mlg_mtx);
1129 return (B_FALSE);
1130 }
1131
1132 /*
1133 * Then the "default" entry which we enable when we have no VLAN IDs
1134 * added to the group (we start with this enabled).
1135 */
1136 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1137 g->mlg_rx_vlan_def_fg = fg;
1138 list_insert_tail(&ft->mlft_groups, fg);
1139 fg->mlfg_table = ft;
1140 fg->mlfg_size = 1;
1141 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1142 mutex_exit(&ft->mlft_mtx);
1143 mutex_exit(&g->mlg_mtx);
1144 return (B_FALSE);
1145 }
1146 fe = list_head(&fg->mlfg_entries);
1147 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1148 mutex_exit(&ft->mlft_mtx);
1149 mutex_exit(&g->mlg_mtx);
1150 return (B_FALSE);
1151 }
1152
1153 /*
1154 * Finally, the promisc entry which points at the *hash ft* from the
1155 * default group. We only enable this when we have promisc on.
1156 */
1157 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1158 g->mlg_rx_vlan_promisc_fg = fg;
1159 list_insert_tail(&ft->mlft_groups, fg);
1160 fg->mlfg_table = ft;
1161 fg->mlfg_size = 1;
1162 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1163 mutex_exit(&ft->mlft_mtx);
1164 mutex_exit(&g->mlg_mtx);
1165 return (B_FALSE);
1166 }
1167 fe = list_head(&fg->mlfg_entries);
1168 fe->mlfe_ndest = 1;
1169 fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;
1170
1171 mutex_exit(&ft->mlft_mtx);
1172
1173 mutex_exit(&g->mlg_mtx);
1174
1175 return (B_TRUE);
1176 }
1177
1178 boolean_t
1179 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1180 mlxcx_work_queue_t *rq)
1181 {
1182 uint_t j;
1183 mlxcx_buffer_t *b;
1184 mlxcx_completion_queue_t *cq;
1185
1186 mutex_enter(&g->mlg_mtx);
1187 /*
1188 * Sadly, even though MAC has the mgi_start callback, it is not always
1189 * called -- in particular when we are being managed under an aggr, the
1190 * mgi_start callback will only ever be called on the default group.
1191 *
1192 * So instead of asserting about the group state here, we have to
1193 * check it and call group start if needed.
1194 */
1195 if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
1196 mutex_exit(&g->mlg_mtx);
1197 if (!mlxcx_rx_group_start(mlxp, g))
1198 return (B_FALSE);
1199 mutex_enter(&g->mlg_mtx);
1200 }
1201 ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);
1202
1203 cq = rq->mlwq_cq;
1204 ASSERT(cq != NULL);
1205
1206 mutex_enter(&cq->mlcq_mtx);
1207 mutex_enter(&rq->mlwq_mtx);
1208
1209 if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1210 mutex_exit(&rq->mlwq_mtx);
1211 mutex_exit(&cq->mlcq_mtx);
1212 mutex_exit(&g->mlg_mtx);
1213 return (B_TRUE);
1214 }
1215
1216 if (!mlxcx_cmd_start_rq(mlxp, rq)) {
1217 mutex_exit(&rq->mlwq_mtx);
1218 mutex_exit(&cq->mlcq_mtx);
1219 mutex_exit(&g->mlg_mtx);
1220 return (B_FALSE);
1221 }
1222 ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);
1223
1224 ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
1225 rq->mlwq_state |= MLXCX_WQ_BUFFERS;
1226
1227 mlxcx_shard_ready(rq->mlwq_bufs);
1228
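	/*
	 * Populate the free list with 1.5x the ring size worth of buffers
	 * before doing the initial refill below.
	 */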
1229 for (j = 0; j < rq->mlwq_nents; ++j) {
1230 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1231 break;
1232 mlxcx_buf_return(mlxp, b);
1233 }
1234 for (j = 0; j < rq->mlwq_nents / 2; ++j) {
1235 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
1236 break;
1237 mlxcx_buf_return(mlxp, b);
1238 }
1239
1240 mlxcx_rq_refill(mlxp, rq);
1241
1242 mutex_exit(&rq->mlwq_mtx);
1243 mutex_exit(&cq->mlcq_mtx);
1244 mutex_exit(&g->mlg_mtx);
1245
1246 return (B_TRUE);
1247 }
1248
1249 boolean_t
1250 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1251 {
1252 mlxcx_flow_table_t *ft;
1253 mlxcx_flow_group_t *fg;
1254 mlxcx_flow_entry_t *fe;
1255 char tq_name[TASKQ_NAMELEN];
1256
1257 mutex_enter(&g->mlg_mtx);
1258
1259 if (g->mlg_state & MLXCX_GROUP_RUNNING) {
1260 mutex_exit(&g->mlg_mtx);
1261 return (B_TRUE);
1262 }
1263
1264 ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);
1265
1266 g->mlg_state |= MLXCX_GROUP_RUNNING;
1267
1268 (void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
1269 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
1270 g - &mlxp->mlx_rx_groups[0]);
1271
1272 /*
1273 * Create one refill taskq per group with one thread per work queue.
1274 * The refill task may block waiting for resources, so by effectively
1275 * having one thread per work queue we avoid work queues blocking each
1276 * other.
1277 */
1278 if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
1279 g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
1280 mlxcx_warn(mlxp, "failed to create rq refill task queue");
1281 mutex_exit(&g->mlg_mtx);
1282 return (B_FALSE);
1283 }
1284
1285 if (g == &mlxp->mlx_rx_groups[0]) {
1286 ft = g->mlg_port->mlp_rx_flow;
1287 mutex_enter(&ft->mlft_mtx);
1288
1289 /*
1290 * Broadcast and promisc entries go directly to group 0's
1291 * RSS hash fanout flow table. They bypass VLAN filtering.
1292 */
1293 fg = g->mlg_port->mlp_bcast;
1294 fe = list_head(&fg->mlfg_entries);
1295 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1296 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1297 mutex_exit(&ft->mlft_mtx);
1298 g->mlg_state &= ~MLXCX_GROUP_RUNNING;
1299 taskq_destroy(g->mlg_refill_tq);
1300 mutex_exit(&g->mlg_mtx);
1301 return (B_FALSE);
1302 }
1303
1304 fg = g->mlg_port->mlp_promisc;
1305 fe = list_head(&fg->mlfg_entries);
1306 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
1307 /*
1308 * Don't actually set the promisc entry until promisc is
1309 * enabled.
1310 */
1311
1312 mutex_exit(&ft->mlft_mtx);
1313 }
1314
1315 mutex_exit(&g->mlg_mtx);
1316
1317 return (B_TRUE);
1318 }
1319
1320 boolean_t
1321 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1322 {
1323 mlxcx_event_queue_t *eq;
1324 mlxcx_completion_queue_t *cq;
1325 mlxcx_work_queue_t *sq;
1326 uint_t i;
1327
1328 ASSERT3S(g->mlg_state, ==, 0);
1329
1330 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
1331 DDI_INTR_PRI(mlxp->mlx_intr_pri));
1332 g->mlg_state |= MLXCX_GROUP_INIT;
1333 mutex_enter(&g->mlg_mtx);
1334
1335 g->mlg_mlx = mlxp;
1336 g->mlg_type = MLXCX_GROUP_TX;
1337 g->mlg_port = &mlxp->mlx_ports[0];
1338
1339 g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
1340 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
1341 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
1342 g->mlg_state |= MLXCX_GROUP_WQS;
1343
1344 g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;
1345
1346 if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
1347 mutex_exit(&g->mlg_mtx);
1348 return (B_FALSE);
1349 }
1350
1351 g->mlg_state |= MLXCX_GROUP_TIRTIS;
1352
1353 for (i = 0; i < g->mlg_nwqs; ++i) {
1354 eq = NULL;
1355 while (eq == NULL) {
1356 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
1357 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
1358 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
1359 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
1360 eq->mleq_type != MLXCX_EQ_TYPE_TX) {
1361 /* Try the next one */
1362 eq = NULL;
1363 }
1364 }
1365
1366 if (!mlxcx_cq_setup(mlxp, eq, &cq,
1367 mlxp->mlx_props.mldp_cq_size_shift))
1368 return (B_FALSE);
1369
1370 cq->mlcq_stats = &g->mlg_port->mlp_stats;
1371
1372 sq = &g->mlg_wqs[i];
1373 if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
1374 mutex_exit(&g->mlg_mtx);
1375 return (B_FALSE);
1376 }
1377 sq->mlwq_group = g;
1378 }
1379
1380 mutex_exit(&g->mlg_mtx);
1381
1382 return (B_TRUE);
1383 }
1384
1385 boolean_t
1386 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1387 mlxcx_work_queue_t *sq)
1388 {
1389 uint_t i;
1390 mlxcx_buffer_t *b;
1391 mlxcx_completion_queue_t *cq;
1392
1393 mutex_enter(&g->mlg_mtx);
1394
1395 cq = sq->mlwq_cq;
1396 ASSERT(cq != NULL);
1397
1398 mutex_enter(&cq->mlcq_mtx);
1399 mutex_enter(&sq->mlwq_mtx);
1400 if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1401 mutex_exit(&sq->mlwq_mtx);
1402 mutex_exit(&cq->mlcq_mtx);
1403 mutex_exit(&g->mlg_mtx);
1404 return (B_TRUE);
1405 }
1406
1407 ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
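	/*
	 * Populate both the foreign and driver-owned buffer shards before
	 * starting the SQ.
	 */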
1408 for (i = 0; i < sq->mlwq_nents; ++i) {
1409 if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1410 break;
1411 mlxcx_buf_return(mlxp, b);
1412 }
1413 for (i = 0; i < sq->mlwq_nents / 2; ++i) {
1414 if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
1415 break;
1416 mlxcx_buf_return(mlxp, b);
1417 }
1418 for (i = 0; i < sq->mlwq_nents; ++i) {
1419 if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
1420 break;
1421 mlxcx_buf_return(mlxp, b);
1422 }
1423 sq->mlwq_state |= MLXCX_WQ_BUFFERS;
1424
1425 mlxcx_shard_ready(sq->mlwq_bufs);
1426 mlxcx_shard_ready(sq->mlwq_foreign_bufs);
1427
1428 if (!mlxcx_cmd_start_sq(mlxp, sq)) {
1429 mutex_exit(&sq->mlwq_mtx);
1430 mutex_exit(&cq->mlcq_mtx);
1431 mutex_exit(&g->mlg_mtx);
1432 return (B_FALSE);
1433 }
1434 g->mlg_state |= MLXCX_GROUP_RUNNING;
1435
1436 (void) mlxcx_sq_add_nop(mlxp, sq);
1437
1438 mutex_exit(&sq->mlwq_mtx);
1439 mutex_exit(&cq->mlcq_mtx);
1440 mutex_exit(&g->mlg_mtx);
1441
1442 return (B_TRUE);
1443 }
1444
1445 static boolean_t
1446 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
1447 {
1448 uint_t idx;
1449 mlxcx_bf_t *bf;
1450 ddi_fm_error_t err;
1451 uint_t try = 0;
1452
1453 ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
1454 ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1455
1456 mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);
1457
1458 ASSERT(mlwq->mlwq_cq != NULL);
1459 ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
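	/*
	 * Pick a doorbell (blue flame) register in our UAR based on the
	 * EQ's interrupt index, masked to the number of BF registers
	 * per UAR.
	 */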
1460 idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
1461 bf = &mlwq->mlwq_uar->mlu_bf[idx];
1462
1463 retry:
1464 MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1465 ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1466 DDI_FME_VERSION);
1467 if (err.fme_status != DDI_FM_OK) {
1468 if (try++ < mlxcx_doorbell_tries) {
1469 ddi_fm_dma_err_clear(
1470 mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
1471 DDI_FME_VERSION);
1472 goto retry;
1473 } else {
1474 goto err;
1475 }
1476 }
1477
1478 mlxcx_put64(mlxp, bf->mbf_even, from_be64(
1479 mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
1480 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
1481 DDI_FME_VERSION);
1482 if (err.fme_status == DDI_FM_OK)
1483 return (B_TRUE);
1484 if (try++ < mlxcx_doorbell_tries) {
1485 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
1486 goto retry;
1487 }
1488
1489 err:
1490 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1491 return (B_FALSE);
1492 }
1493
1494 boolean_t
1495 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1496 {
1497 uint_t index, start_pc;
1498 mlxcx_sendq_ent_t *ent0;
1499 ddi_fm_error_t err;
1500
1501 ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1502
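	/*
	 * mlwq_nents is a power of two, so masking the producer counter
	 * (mlwq_pc) yields the ring index of the next WQE slot.
	 */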
1503 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1504 ent0 = &mlwq->mlwq_send_ent[index];
1505 start_pc = mlwq->mlwq_pc;
1506 ++mlwq->mlwq_pc;
1507 /*
1508 * This counter is manipulated in the interrupt handler, which
1509 * does not hold the mlwq_mtx, hence the atomic.
1510 */
1511 atomic_inc_64(&mlwq->mlwq_wqebb_used);
1512
1513 bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1514 ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
1515 ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1516 ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);
1517
1518 set_bits8(&ent0->mlsqe_control.mlcs_flags,
1519 MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
1520 set_bits8(&ent0->mlsqe_control.mlcs_flags,
1521 MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1522
1523 ent0->mlsqe_control.mlcs_ds = 1;
1524
1525 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1526 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1527 sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1528 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1529 DDI_FME_VERSION);
1530 if (err.fme_status != DDI_FM_OK) {
1531 return (B_FALSE);
1532 }
1533 if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
1534 return (B_FALSE);
1535 }
1536 return (B_TRUE);
1537 }
1538
1539 boolean_t
1540 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1541 uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
1542 mlxcx_buffer_t *b0)
1543 {
1544 uint_t index, first, ents;
1545 mlxcx_completion_queue_t *cq;
1546 mlxcx_sendq_ent_t *ent0;
1547 mlxcx_sendq_extra_ent_t *ent;
1548 mlxcx_wqe_data_seg_t *seg;
1549 uint_t ptri, nptr;
1550 const ddi_dma_cookie_t *c;
1551 size_t rem;
1552 uint64_t wqebb_used;
1553 mlxcx_buffer_t *b;
1554 ddi_fm_error_t err;
1555 boolean_t rv;
1556
1557 ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1558 ASSERT3P(b0->mlb_tx_head, ==, b0);
1559 ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1560 cq = mlwq->mlwq_cq;
1561
1562 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1563 ent0 = &mlwq->mlwq_send_ent[index];
1564 b0->mlb_wqe_index = mlwq->mlwq_pc;
1565 ents = 1;
1566
1567 first = index;
1568
1569 bzero(ent0, sizeof (mlxcx_sendq_ent_t));
1570 ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
1571 ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
1572 ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);
1573
1574 set_bits8(&ent0->mlsqe_control.mlcs_flags,
1575 MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
1576 set_bits8(&ent0->mlsqe_control.mlcs_flags,
1577 MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
1578
1579 VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
1580 set_bits16(&ent0->mlsqe_eth.mles_szflags,
1581 MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
1582 if (inlinelen > 0) {
1583 bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
1584 inlinelen);
1585 }
1586
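	/*
	 * mlcs_ds is the WQE size in units of MLXCX_WQE_OCTOWORD bytes.
	 * Start with the control and eth segments; it is incremented
	 * below for each data segment added.
	 */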
1587 ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
1588 MLXCX_WQE_OCTOWORD;
1589
1590 if (chkflags & HCK_IPV4_HDRCKSUM) {
1591 ASSERT(mlxp->mlx_caps->mlc_checksum);
1592 set_bit8(&ent0->mlsqe_eth.mles_csflags,
1593 MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
1594 }
1595 if (chkflags & HCK_FULLCKSUM) {
1596 ASSERT(mlxp->mlx_caps->mlc_checksum);
1597 set_bit8(&ent0->mlsqe_eth.mles_csflags,
1598 MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
1599 }
1600
1601 /*
1602 * mlwq_wqebb_used is only incremented whilst holding
1603 * the mlwq_mtx mutex, but it is decremented (atomically) in
1604 * the interrupt context *not* under mlwq_mtx mutex.
1605 * So, now take a snapshot of the number of used wqes which will
1606 	 * be a consistent maximum we can use whilst iterating through
1607 * the buffers and DMA cookies.
1608 */
1609 wqebb_used = mlwq->mlwq_wqebb_used;
1610
1611 b = b0;
1612 ptri = 0;
1613 nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
1614 seg = ent0->mlsqe_data;
1615 while (b != NULL) {
1616 rem = b->mlb_used;
1617
1618 c = NULL;
1619 while (rem > 0 &&
1620 (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
1621 if (ptri >= nptr) {
1622 if ((ents + wqebb_used) >= mlwq->mlwq_nents)
1623 return (B_FALSE);
1624
1625 index = (mlwq->mlwq_pc + ents) &
1626 (mlwq->mlwq_nents - 1);
1627 ent = &mlwq->mlwq_send_extra_ent[index];
1628 ++ents;
1629
1630 seg = ent->mlsqe_data;
1631 ptri = 0;
1632 nptr = sizeof (ent->mlsqe_data) /
1633 sizeof (mlxcx_wqe_data_seg_t);
1634 }
1635
1636 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1637 if (c->dmac_size > rem) {
1638 seg->mlds_byte_count = to_be32(rem);
1639 rem = 0;
1640 } else {
1641 seg->mlds_byte_count = to_be32(c->dmac_size);
1642 rem -= c->dmac_size;
1643 }
1644 seg->mlds_address = to_be64(c->dmac_laddress);
1645 ++seg;
1646 ++ptri;
1647 ++ent0->mlsqe_control.mlcs_ds;
1648
1649 ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
1650 MLXCX_SQE_MAX_DS);
1651 }
1652
1653 if (b == b0) {
1654 b = list_head(&b0->mlb_tx_chain);
1655 } else {
1656 b = list_next(&b0->mlb_tx_chain, b);
1657 }
1658 }
1659
1660 b0->mlb_wqebbs = ents;
1661 mlwq->mlwq_pc += ents;
1662 atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
1663
1664 for (; ptri < nptr; ++ptri, ++seg) {
1665 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1666 seg->mlds_byte_count = to_be32(0);
1667 seg->mlds_address = to_be64(0);
1668 }
1669
1670 /*
1671 * Make sure the workqueue entry is flushed out before updating
1672 * the doorbell.
1673 * If the ring has wrapped, we need to flush the front and back.
1674 */
1675 if ((first + ents) > mlwq->mlwq_nents) {
1676 uint_t sync_cnt = mlwq->mlwq_nents - first;
1677
1678 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1679 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1680 sync_cnt * sizeof (mlxcx_sendq_ent_t),
1681 DDI_DMA_SYNC_FORDEV));
1682
1683 ent0 = &mlwq->mlwq_send_ent[0];
1684 ents -= sync_cnt;
1685 }
1686
1687 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1688 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
1689 ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
1690 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1691 DDI_FME_VERSION);
1692 if (err.fme_status != DDI_FM_OK) {
1693 return (B_FALSE);
1694 }
1695
1696 /*
1697 * Hold the bufmtx whilst ringing the doorbell, to prevent
1698 * the buffer from being moved to another list, so we can
1699 * safely remove it should the ring fail.
1700 */
1701 mutex_enter(&cq->mlcq_bufbmtx);
1702
1703 list_insert_tail(&cq->mlcq_buffers_b, b0);
1704 if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
1705 atomic_inc_64(&cq->mlcq_bufcnt);
1706 } else {
1707 list_remove(&cq->mlcq_buffers_b, b0);
1708 }
1709
1710 mutex_exit(&cq->mlcq_bufbmtx);
1711
1712 return (rv);
1713 }
1714
1715 boolean_t
1716 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1717 mlxcx_buffer_t *buf)
1718 {
1719 return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
1720 }
1721
1722 boolean_t
1723 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
1724 mlxcx_buffer_t **bufs, size_t nbufs)
1725 {
1726 uint_t index;
1727 mlxcx_recvq_ent_t *ent;
1728 mlxcx_completion_queue_t *cq;
1729 mlxcx_wqe_data_seg_t *seg;
1730 uint_t bi, ptri;
1731 const ddi_dma_cookie_t *c;
1732 mlxcx_buffer_t *buf;
1733 ddi_fm_error_t err;
1734
1735 ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1736 cq = mlwq->mlwq_cq;
1737 ASSERT(mutex_owned(&cq->mlcq_mtx));
1738
1739 for (bi = 0; bi < nbufs; ++bi) {
1740 buf = bufs[bi];
1741 bufs[bi] = NULL;
1742 ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
1743
1744 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
1745 ent = &mlwq->mlwq_recv_ent[index];
1746 buf->mlb_wqe_index = mlwq->mlwq_pc;
1747 buf->mlb_wqebbs = 1;
1748
1749 ++mlwq->mlwq_pc;
1750 atomic_inc_64(&mlwq->mlwq_wqebb_used);
1751
1752 mutex_enter(&cq->mlcq_bufbmtx);
1753 list_insert_tail(&cq->mlcq_buffers, buf);
1754 atomic_inc_64(&cq->mlcq_bufcnt);
1755 mutex_exit(&cq->mlcq_bufbmtx);
1756
1757 ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
1758 ptri = 0;
1759 c = NULL;
1760 while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
1761 seg = &ent->mlrqe_data[ptri++];
1762 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
1763 seg->mlds_byte_count = to_be32(c->dmac_size);
1764 seg->mlds_address = to_be64(c->dmac_laddress);
1765 }
1766 /*
1767 * Fill any unused scatter pointers with the special null
1768 * value.
1769 */
1770 for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
1771 seg = &ent->mlrqe_data[ptri];
1772 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
1773 seg->mlds_byte_count = to_be32(0);
1774 seg->mlds_address = to_be64(0);
1775 }
1776
1777 /*
1778 * Make sure the workqueue entry is flushed out before updating
1779 * the doorbell.
1780 */
1781 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
1782 (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
1783 sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
1784 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
1785 DDI_FME_VERSION);
1786 if (err.fme_status != DDI_FM_OK) {
1787 return (B_FALSE);
1788 }
1789 }
1790
1791 mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
1792 /*
1793 * Flush the CQ doorbell as well so that HW knows how many
1794 * completions we've consumed.
1795 */
1796 MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1797 ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
1798 DDI_FME_VERSION);
1799 if (err.fme_status != DDI_FM_OK) {
1800 return (B_FALSE);
1801 }
1802 MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
1803 ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
1804 DDI_FME_VERSION);
1805 if (err.fme_status != DDI_FM_OK) {
1806 return (B_FALSE);
1807 }
1808 return (B_TRUE);
1809 }
1810
1811 static void
1812 mlxcx_rq_refill_task(void *arg)
1813 {
1814 mlxcx_work_queue_t *wq = arg;
1815 mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1816 mlxcx_t *mlxp = wq->mlwq_mlx;
1817 mlxcx_buf_shard_t *s = wq->mlwq_bufs;
1818 boolean_t refill, draining;
1819
1820 do {
1821 /*
1822 * Wait here until one of 3 conditions:
1823 * 1. The shard is draining, or
1824 * 2. There are buffers on the free list, or
1825 * 3. The WQ is being shut down.
1826 */
1827 mutex_enter(&s->mlbs_mtx);
1828 while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
1829 list_is_empty(&s->mlbs_free) &&
1830 (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
1831 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
1832 }
1833
1834 draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
1835 mutex_exit(&s->mlbs_mtx);
1836
1837 mutex_enter(&cq->mlcq_mtx);
1838 mutex_enter(&wq->mlwq_mtx);
1839
1840 if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
1841 refill = B_FALSE;
1842 wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1843 } else {
1844 mlxcx_rq_refill(mlxp, wq);
1845
1846 if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
1847 refill = B_TRUE;
1848 } else {
1849 refill = B_FALSE;
1850 wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
1851 }
1852 }
1853
1854 mutex_exit(&wq->mlwq_mtx);
1855 mutex_exit(&cq->mlcq_mtx);
1856 } while (refill);
1857 }
1858
1859 void
1860 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
1861 {
1862 size_t target, current, want, done, n;
1863 mlxcx_completion_queue_t *cq;
1864 mlxcx_ring_group_t *g;
1865 mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
1866 uint_t i;
1867
1868 ASSERT(mutex_owned(&mlwq->mlwq_mtx));
1869 cq = mlwq->mlwq_cq;
1870 ASSERT(mutex_owned(&cq->mlcq_mtx));
1871
1872 ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);
1873
1874 target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
1875 cq = mlwq->mlwq_cq;
1876
1877 if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
1878 return;
1879
1880 if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
1881 return;
1882
1883 current = cq->mlcq_bufcnt;
1884
1885 if (current >= target - MLXCX_RQ_REFILL_STEP)
1886 return;
1887
1888 want = target - current;
1889 done = 0;
1890
1891 while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
1892 n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
1893 if (n == 0) {
1894 /*
1895 * We didn't get any buffers from the free queue.
1896 			 * This might not be a problem; if the completion
1897 			 * queue is running low, dispatch a task to wait for
1898 			 * free buffers and refill once some are returned.
1899 */
1900 if (current < MLXCX_RQ_REFILL_STEP &&
1901 (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
1902 mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
1903 g = mlwq->mlwq_group;
1904 taskq_dispatch_ent(g->mlg_refill_tq,
1905 mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
1906 &mlwq->mlwq_tqe);
1907 }
1908
1909 return;
1910 }
1911
1912 if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
1913 for (i = 0; i < n; ++i)
1914 mlxcx_buf_return(mlxp, b[i]);
1915 return;
1916 }
1917 if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
1918 /*
1919 * mlxcx_rq_add_buffers NULLs out the buffers as it
1920 			 * enqueues them, so any that are still non-NULL must
1921 			 * be freed here. The rest now belong to the WQ, even
1922 			 * though the call failed.
1923 */
1924 for (i = 0; i < n; ++i) {
1925 if (b[i] != NULL) {
1926 mlxcx_buf_return(mlxp, b[i]);
1927 }
1928 }
1929 return;
1930 }
1931 done += n;
1932 }
1933 }
1934
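/*
 * Translate a CQ error syndrome into a human-readable name for use in FMA
 * ereports.
 */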
1935 static const char *
1936 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
1937 {
1938 switch (sy) {
1939 case MLXCX_CQ_ERR_LOCAL_LENGTH:
1940 return ("LOCAL_LENGTH");
1941 case MLXCX_CQ_ERR_LOCAL_QP_OP:
1942 return ("LOCAL_QP_OP");
1943 case MLXCX_CQ_ERR_LOCAL_PROTECTION:
1944 return ("LOCAL_PROTECTION");
1945 case MLXCX_CQ_ERR_WR_FLUSHED:
1946 return ("WR_FLUSHED");
1947 case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
1948 return ("MEM_WINDOW_BIND");
1949 case MLXCX_CQ_ERR_BAD_RESPONSE:
1950 return ("BAD_RESPONSE");
1951 case MLXCX_CQ_ERR_LOCAL_ACCESS:
1952 return ("LOCAL_ACCESS");
1953 case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
1954 return ("XPORT_RETRY_CTR");
1955 case MLXCX_CQ_ERR_RNR_RETRY_CTR:
1956 return ("RNR_RETRY_CTR");
1957 case MLXCX_CQ_ERR_ABORTED:
1958 return ("ABORTED");
1959 default:
1960 return ("UNKNOWN");
1961 }
1962 }
1963
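/*
 * Post an FMA ereport for an error completion entry, including the
 * syndrome, the WQE counter and the queue numbers involved, and mark the
 * service as degraded.
 */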
1964 static void
1965 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1966 mlxcx_completionq_error_ent_t *ent)
1967 {
1968 uint64_t ena;
1969 char buf[FM_MAX_CLASS];
1970 const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);
1971
1972 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1973 return;
1974
1975 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1976 MLXCX_FM_SERVICE_MLXCX, "cqe.err");
1977 ena = fm_ena_generate(0, FM_ENA_FMT1);
1978
1979 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1980 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1981 "syndrome", DATA_TYPE_STRING, name,
1982 "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
1983 "vendor_syndrome", DATA_TYPE_UINT8,
1984 ent->mlcqee_vendor_error_syndrome,
1985 "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
1986 "wq_type", DATA_TYPE_STRING,
1987 (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
1988 "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
1989 "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
1990 NULL);
1991 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1992 }
1993
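/*
 * Process a send completion: report error CQEs via FMA (and kick off an SQ
 * health check), warn about unexpected opcodes or formats, and in all cases
 * return the buffer chain that backed the WQE.
 */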
1994 void
1995 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
1996 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
1997 {
1998 ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1999 if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
2000 mlxcx_completionq_error_ent_t *eent =
2001 (mlxcx_completionq_error_ent_t *)ent;
2002 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2003 mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2004 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2005 mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
2006 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2007 return;
2008 }
2009
2010 if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
2011 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2012 mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2013 return;
2014 }
2015
2016 if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
2017 mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
2018 ent->mlcqe_send_wqe_opcode);
2019 mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2020 return;
2021 }
2022
2023 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2024 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2025 mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2026 return;
2027 }
2028
2029 mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
2030 }
2031
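/*
 * Process a receive completion and construct an mblk to pass up to MAC.
 * The received buffer is loaned to MAC where possible; otherwise the packet
 * is copied into a freshly allocated mblk and the buffer is returned to the
 * free list. Returns NULL (and recycles the buffer) on any error. Hardware
 * checksum flags are set when the CQE indicates the checksums were
 * verified, and every eighth WQE index we check whether the RQ needs a
 * refill.
 */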
2032 mblk_t *
2033 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
2034 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
2035 {
2036 uint32_t chkflags = 0;
2037 uint_t wqe_index, used;
2038 ddi_fm_error_t err;
2039 mblk_t *mp;
2040
2041 ASSERT(mutex_owned(&mlcq->mlcq_mtx));
2042
2043 if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
2044 mlxcx_completionq_error_ent_t *eent =
2045 (mlxcx_completionq_error_ent_t *)ent;
2046 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
2047 mlxcx_buf_return(mlxp, buf);
2048 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
2049 mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
2050 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
2051 return (NULL);
2052 }
2053
2054 if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
2055 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
2056 mlxcx_buf_return(mlxp, buf);
2057 return (NULL);
2058 }
2059
2060 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
2061 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
2062 mlxcx_buf_return(mlxp, buf);
2063 return (NULL);
2064 }
2065
2066 if (ent->mlcqe_rx_drop_counter > 0) {
2067 atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
2068 ent->mlcqe_rx_drop_counter);
2069 }
2070
2071 MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
2072 ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
2073 DDI_FME_VERSION);
2074 if (err.fme_status != DDI_FM_OK) {
2075 ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
2076 DDI_FME_VERSION);
2077 mlxcx_buf_return(mlxp, buf);
2078 return (NULL);
2079 }
2080
2081 /*
2082 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
2083 * Remember it for later.
2084 */
2085 wqe_index = buf->mlb_wqe_index;
2086
2087 /* Set the used field with the actual length of the packet. */
2088 buf->mlb_used = (used = from_be32(ent->mlcqe_byte_cnt));
2089
2090 /* Try to loan this buffer to MAC directly. */
2091 if (mlxcx_buf_loan(mlxp, buf)) {
2092 mp = buf->mlb_mp;
2093
2094 } else {
2095 /*
2096 * Loan rejected: we will try to allocate a new mblk and copy
2097 * this packet for MAC instead.
2098 */
2099 mp = allocb(buf->mlb_used, 0);
2100 if (mp == NULL) {
2101 /* No memory :( */
2102 atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 1);
2103 mlxcx_buf_return(mlxp, buf);
2104 return (NULL);
2105 }
2106 bcopy((unsigned char *)buf->mlb_dma.mxdb_va, mp->b_rptr,
2107 buf->mlb_used);
2108
2109 /* We're done with this buf now, return it to the free list. */
2110 mlxcx_buf_return(mlxp, buf);
2111 buf = NULL;
2112 }
2113
2114 mp->b_next = NULL;
2115 mp->b_cont = NULL;
2116 mp->b_wptr = mp->b_rptr + used;
2117
2118 if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
2119 chkflags |= HCK_FULLCKSUM_OK;
2120 }
2121 if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
2122 chkflags |= HCK_IPV4_HDRCKSUM_OK;
2123 }
2124 if (chkflags != 0) {
2125 mac_hcksum_set(mp, 0, 0, 0, from_be16(ent->mlcqe_checksum),
2126 chkflags);
2127 }
2128
2129 /*
2130 * Don't check if a refill is needed on every single completion,
2131 * since checking involves taking the RQ lock.
2132 */
2133 if ((wqe_index & 0x7) == 0) {
2134 mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
2135 ASSERT(wq != NULL);
2136 mutex_enter(&wq->mlwq_mtx);
2137 if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
2138 mlxcx_rq_refill(mlxp, wq);
2139 mutex_exit(&wq->mlwq_mtx);
2140 }
2141
2142 return (mp);
2143 }
2144
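/*
 * desballoc() free callback: invoked when MAC (or anyone else) frees the
 * mblk wrapping one of our buffers. If the buffer was out on loan, put it
 * back on its shard's free list.
 */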
2145 static void
2146 mlxcx_buf_mp_return(caddr_t arg)
2147 {
2148 mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
2149 mlxcx_t *mlxp = b->mlb_mlx;
2150
2151 /* The mblk has been used now, so NULL it out. */
2152 b->mlb_mp = NULL;
2153
2154 if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
2155 mlxcx_buf_return(mlxp, b);
2156 }
2157
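/*
 * Allocate a normal (driver-owned) buffer: DMA memory sized to the port MTU
 * plus an mblk wrapping it via desballoc().
 */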
2158 boolean_t
2159 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
2160 {
2161 mlxcx_buffer_t *b;
2162 ddi_device_acc_attr_t acc;
2163 ddi_dma_attr_t attr;
2164 boolean_t ret;
2165
2166 b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2167 b->mlb_shard = shard;
2168 b->mlb_foreign = B_FALSE;
2169
2170 mlxcx_dma_acc_attr(mlxp, &acc);
2171 mlxcx_dma_buf_attr(mlxp, &attr);
2172
2173 ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
2174 B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
2175 if (!ret) {
2176 kmem_cache_free(mlxp->mlx_bufs_cache, b);
2177 return (B_FALSE);
2178 }
2179
2180 b->mlb_frtn.free_func = mlxcx_buf_mp_return;
2181 b->mlb_frtn.free_arg = (caddr_t)b;
2182 b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2183 b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2184
2185 *bp = b;
2186
2187 return (B_TRUE);
2188 }
2189
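/*
 * Allocate a "foreign" buffer: one with a DMA handle but no memory of its
 * own, used on the send side to bind directly to an mblk's data.
 */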
2190 boolean_t
2191 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
2192 mlxcx_buffer_t **bp)
2193 {
2194 mlxcx_buffer_t *b;
2195 ddi_dma_attr_t attr;
2196 boolean_t ret;
2197
2198 b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
2199 b->mlb_shard = shard;
2200 b->mlb_foreign = B_TRUE;
2201
2202 mlxcx_dma_buf_attr(mlxp, &attr);
2203
2204 ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
2205 if (!ret) {
2206 kmem_cache_free(mlxp->mlx_bufs_cache, b);
2207 return (B_FALSE);
2208 }
2209
2210 *bp = b;
2211
2212 return (B_TRUE);
2213 }
2214
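/*
 * Take a foreign buffer from the WQ's foreign shard free list, moving it to
 * the busy list. Returns NULL if the shard is not ready or has no free
 * buffers.
 */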
2215 static mlxcx_buffer_t *
2216 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2217 {
2218 mlxcx_buffer_t *b;
2219 mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
2220
2221 mutex_enter(&s->mlbs_mtx);
2222 if (s->mlbs_state != MLXCX_SHARD_READY) {
2223 mutex_exit(&s->mlbs_mtx);
2224 return (NULL);
2225 }
2226
2227 if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2228 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2229 ASSERT(b->mlb_foreign);
2230 b->mlb_state = MLXCX_BUFFER_ON_WQ;
2231 list_insert_tail(&s->mlbs_busy, b);
2232 }
2233 mutex_exit(&s->mlbs_mtx);
2234
2235 return (b);
2236 }
2237
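/*
 * Copy packet data into a freshly taken buffer and sync it for the device,
 * retrying with a new buffer (up to MLXCX_BUF_BIND_MAX_ATTEMTPS times) if
 * the DMA sync reports an FM error.
 */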
2238 static mlxcx_buffer_t *
2239 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
2240 {
2241 ddi_fm_error_t err;
2242 mlxcx_buffer_t *b;
2243 uint_t attempts = 0;
2244
2245 copyb:
2246 if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
2247 return (NULL);
2248
2249 ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
2250 bcopy(rptr, b->mlb_dma.mxdb_va, sz);
2251
2252 MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
2253
2254 ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
2255 DDI_FME_VERSION);
2256 if (err.fme_status != DDI_FM_OK) {
2257 ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
2258 DDI_FME_VERSION);
2259 mlxcx_buf_return(mlxp, b);
2260 if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
2261 return (NULL);
2262 }
2263 goto copyb;
2264 }
2265
2266 return (b);
2267 }
2268
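/*
 * Prepare a single mblk for transmit. Small fragments (below the
 * tx_bind_threshold property) are copied into a driver buffer; larger ones
 * are DMA-bound in place using a foreign buffer, falling back to a copy if
 * the bind fails.
 */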
2269 static mlxcx_buffer_t *
2270 mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2271 mblk_t *mp, size_t off)
2272 {
2273 mlxcx_buffer_t *b;
2274 uint8_t *rptr;
2275 size_t sz;
2276 boolean_t ret;
2277
2278 rptr = mp->b_rptr;
2279 sz = MBLKL(mp);
2280
2281 #ifdef DEBUG
2282 if (off > 0) {
2283 ASSERT3U(off, <, sz);
2284 }
2285 #endif
2286
2287 rptr += off;
2288 sz -= off;
2289
2290 if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
2291 b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2292 } else {
2293 b = mlxcx_buf_take_foreign(mlxp, wq);
2294 if (b == NULL)
2295 return (NULL);
2296
2297 ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
2298 B_FALSE);
2299
2300 if (!ret) {
2301 mlxcx_buf_return(mlxp, b);
2302
2303 b = mlxcx_copy_data(mlxp, wq, rptr, sz);
2304 }
2305 }
2306
2307 return (b);
2308 }
2309
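/*
 * Prepare an entire mblk chain for transmit, producing a chain of buffers
 * headed by *bp. If the chain would need more scatter pointers than a
 * single SQE allows (MLXCX_SQE_MAX_PTRS), the message is pulled up into one
 * mblk and re-bound. Returns the number of buffers used, or 0 on failure.
 */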
2310 uint_t
2311 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
2312 mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
2313 {
2314 mlxcx_buffer_t *b, *b0 = NULL;
2315 boolean_t first = B_TRUE;
2316 mblk_t *mp;
2317 size_t offset = off;
2318 size_t ncookies = 0;
2319 uint_t count = 0;
2320
2321 for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
2322 mp = mp->b_cont) {
2323 b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
2324 if (b == NULL)
2325 goto failed;
2326
2327 ncookies += b->mlb_dma.mxdb_ncookies;
2328
2329 if (first)
2330 b0 = b;
2331
2332 if (!first)
2333 b->mlb_state = MLXCX_BUFFER_ON_CHAIN;
2334
2335 b->mlb_tx_mp = mp;
2336 b->mlb_tx_head = b0;
2337 b->mlb_used = MBLKL(mp) - offset;
2338
2339 if (!first)
2340 list_insert_tail(&b0->mlb_tx_chain, b);
2341 first = B_FALSE;
2342 offset = 0;
2343
2344 count++;
2345 }
2346
2347 /*
2348 * The chain of mblks has resulted in too many cookies for
2349 	 * a single message. This is unusual, so take the hit to tidy
2350 	 * up: pull the chain up into a single mblk and allocate the
2351 	 * requisite buf.
2352 */
2353 if (ncookies > MLXCX_SQE_MAX_PTRS) {
2354 DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
2355 mblk_t *, mpb, size_t, ncookies);
2356
2357 if (b0 != NULL)
2358 mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2359
2360 if ((mp = msgpullup(mpb, -1)) == NULL)
2361 return (0);
2362
2363 b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
2364 if (b0 == NULL) {
2365 freemsg(mp);
2366 return (0);
2367 }
2368 freemsg(mpb);
2369
2370 b0->mlb_tx_mp = mp;
2371 b0->mlb_tx_head = b0;
2372 b0->mlb_used = MBLKL(mp) - off;
2373
2374 count = 1;
2375 }
2376
2377 *bp = b0;
2378
2379 return (count);
2380
2381 failed:
2382 if (b0 != NULL)
2383 mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
2384
2385 return (0);
2386 }
2387
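/*
 * Take a normal buffer from the WQ's shard free list, moving it to the busy
 * list. Returns NULL if the shard is not ready or has no free buffers.
 */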
2388 mlxcx_buffer_t *
2389 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
2390 {
2391 mlxcx_buffer_t *b;
2392 mlxcx_buf_shard_t *s = wq->mlwq_bufs;
2393
2394 mutex_enter(&s->mlbs_mtx);
2395 if (s->mlbs_state != MLXCX_SHARD_READY) {
2396 mutex_exit(&s->mlbs_mtx);
2397 return (NULL);
2398 }
2399
2400 if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
2401 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2402 b->mlb_state = MLXCX_BUFFER_ON_WQ;
2403 list_insert_tail(&s->mlbs_busy, b);
2404 }
2405 mutex_exit(&s->mlbs_mtx);
2406
2407 return (b);
2408 }
2409
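/*
 * Take up to nbufs buffers from the WQ's shard in a single pass, returning
 * how many were actually obtained.
 */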
2410 size_t
2411 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
2412 size_t nbufs)
2413 {
2414 mlxcx_buffer_t *b;
2415 size_t done = 0;
2416 mlxcx_buf_shard_t *s;
2417
2418 s = wq->mlwq_bufs;
2419
2420 mutex_enter(&s->mlbs_mtx);
2421 if (s->mlbs_state != MLXCX_SHARD_READY) {
2422 mutex_exit(&s->mlbs_mtx);
2423 return (0);
2424 }
2425
2426 while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
2427 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
2428 b->mlb_state = MLXCX_BUFFER_ON_WQ;
2429 list_insert_tail(&s->mlbs_busy, b);
2430 bp[done++] = b;
2431 }
2432 mutex_exit(&s->mlbs_mtx);
2433 return (done);
2434 }
2435
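/*
 * Loan a received buffer to MAC. Fails (so the caller copies instead) if no
 * mblk can be allocated or if the shard already has too many buffers out on
 * loan: above mlbs_hiwat1, only packets at least mldp_rx_p50_loan_min_size
 * in size are loaned, and above mlbs_hiwat2 nothing is.
 */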
2436 boolean_t
2437 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2438 {
2439 mlxcx_buf_shard_t *s = b->mlb_shard;
2440
2441 VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
2442 ASSERT3P(b->mlb_mlx, ==, mlxp);
2443
2444 if (b->mlb_mp == NULL) {
2445 b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
2446 b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
2447 if (b->mlb_mp == NULL)
2448 return (B_FALSE);
2449 }
2450
2451 mutex_enter(&s->mlbs_mtx);
2452
2453 /* Check if we have too many buffers on loan. */
2454 if (s->mlbs_nloaned >= s->mlbs_hiwat1 &&
2455 b->mlb_used < mlxp->mlx_props.mldp_rx_p50_loan_min_size) {
2456 mutex_exit(&s->mlbs_mtx);
2457 return (B_FALSE);
2458 } else if (s->mlbs_nloaned >= s->mlbs_hiwat2) {
2459 mutex_exit(&s->mlbs_mtx);
2460 return (B_FALSE);
2461 }
2462
2463 b->mlb_state = MLXCX_BUFFER_ON_LOAN;
2464 b->mlb_wqe_index = 0;
2465 list_remove(&s->mlbs_busy, b);
2466 list_insert_tail(&s->mlbs_loaned, b);
2467 s->mlbs_nloaned++;
2468 mutex_exit(&s->mlbs_mtx);
2469
2470 return (B_TRUE);
2471 }
2472
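/*
 * Return a whole TX chain of buffers. If b0 is the head of a chain, the
 * chained buffers are returned first. When keepmp is set, the head's
 * tx_mp/tx_head pointers are cleared first so the mblk is not freed (used
 * when the caller still owns the message).
 */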
2473 void
2474 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
2475 {
2476 mlxcx_buffer_t *b;
2477
2478 if (b0->mlb_tx_head != b0) {
2479 mlxcx_buf_return(mlxp, b0);
2480 return;
2481 }
2482
2483 while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
2484 mlxcx_buf_return(mlxp, b);
2485 }
2486 if (keepmp) {
2487 b0->mlb_tx_mp = NULL;
2488 b0->mlb_tx_head = NULL;
2489 }
2490 mlxcx_buf_return(mlxp, b0);
2491 }
2492
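/*
 * Adjust a shard's total buffer count and recompute the loan high-water
 * marks: hiwat1 at half of the total and hiwat2 at three quarters.
 */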
2493 inline void
2494 mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr)
2495 {
2496 s->mlbs_ntotal += incr;
2497 s->mlbs_hiwat1 = s->mlbs_ntotal / 2;
2498 s->mlbs_hiwat2 = 3 * (s->mlbs_ntotal / 4);
2499 }
2500
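/*
 * Return a buffer to its shard's free list, undoing whatever state it was
 * in (on a WQ, on loan, or on a TX chain). Loaned buffers returned while
 * the shard is draining are destroyed rather than recycled.
 */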
2501 void
2502 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2503 {
2504 mlxcx_buffer_state_t oldstate = b->mlb_state;
2505 mlxcx_buffer_t *txhead = b->mlb_tx_head;
2506 mlxcx_buf_shard_t *s = b->mlb_shard;
2507 mblk_t *mp = b->mlb_tx_mp;
2508
2509 VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
2510 ASSERT3P(b->mlb_mlx, ==, mlxp);
2511
2512 /*
2513 * The mlbs_mtx held below is a heavily contended lock, so it is
2514 	 * imperative that we do as much of the buffer cleanup outside the
2515 	 * lock as possible.
2516 */
2517 b->mlb_state = MLXCX_BUFFER_FREE;
2518 b->mlb_wqe_index = 0;
2519 b->mlb_tx_head = NULL;
2520 b->mlb_tx_mp = NULL;
2521 b->mlb_used = 0;
2522 b->mlb_wqebbs = 0;
2523 ASSERT(list_is_empty(&b->mlb_tx_chain));
2524
2525 if (b->mlb_foreign) {
2526 if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
2527 mlxcx_dma_unbind(mlxp, &b->mlb_dma);
2528 }
2529 }
2530
2531 mutex_enter(&s->mlbs_mtx);
2532 switch (oldstate) {
2533 case MLXCX_BUFFER_INIT:
2534 mlxcx_bufshard_adjust_total(s, 1);
2535 break;
2536 case MLXCX_BUFFER_ON_WQ:
2537 list_remove(&s->mlbs_busy, b);
2538 break;
2539 case MLXCX_BUFFER_ON_LOAN:
2540 ASSERT(!b->mlb_foreign);
2541 --s->mlbs_nloaned;
2542 list_remove(&s->mlbs_loaned, b);
2543 if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
2544 /*
2545 			 * When we're draining, e.g. during mac_stop(),
2546 * we destroy the buffer immediately rather than
2547 * recycling it. Otherwise we risk leaving it
2548 * on the free list and leaking it.
2549 */
2550 list_insert_tail(&s->mlbs_free, b);
2551 mlxcx_buf_destroy(mlxp, b);
2552 /*
2553 			 * Teardown might be waiting for the loaned list to empty.
2554 */
2555 cv_broadcast(&s->mlbs_free_nonempty);
2556 mutex_exit(&s->mlbs_mtx);
2557 return;
2558 }
2559 break;
2560 case MLXCX_BUFFER_FREE:
2561 VERIFY(0);
2562 break;
2563 case MLXCX_BUFFER_ON_CHAIN:
2564 ASSERT(txhead != NULL);
2565 list_remove(&txhead->mlb_tx_chain, b);
2566 list_remove(&s->mlbs_busy, b);
2567 break;
2568 }
2569
2570 list_insert_tail(&s->mlbs_free, b);
2571 cv_broadcast(&s->mlbs_free_nonempty);
2572
2573 mutex_exit(&s->mlbs_mtx);
2574
2575 /*
2576 * For TX chain heads, free the mblk_t after we let go of the lock.
2577 * This might be a borrowed buf that we in turn loaned to MAC, in which
2578 * case calling freemsg() on it will re-enter this very function -- so
2579 * we better not be holding the lock!
2580 */
2581 if (txhead == b)
2582 freemsg(mp);
2583 }
2584
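/*
 * Free a buffer entirely, returning it to the kmem cache. The shard mutex
 * must be held, and the buffer must be in the FREE or INIT state.
 */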
2585 void
2586 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
2587 {
2588 mlxcx_buf_shard_t *s = b->mlb_shard;
2589
2590 VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
2591 b->mlb_state == MLXCX_BUFFER_INIT);
2592 ASSERT(mutex_owned(&s->mlbs_mtx));
2593
2594 if (b->mlb_state == MLXCX_BUFFER_FREE) {
2595 list_remove(&s->mlbs_free, b);
2596 mlxcx_bufshard_adjust_total(s, -1);
2597 }
2598
2599 /*
2600 * This is going back to the kmem cache, so it needs to be set up in
2601 * the same way we expect a new buffer to come out (state INIT, other
2602 	 * fields NULL'd).
2603 */
2604 b->mlb_state = MLXCX_BUFFER_INIT;
2605 b->mlb_shard = NULL;
2606 if (b->mlb_mp != NULL) {
2607 freeb(b->mlb_mp);
2608 ASSERT(b->mlb_mp == NULL);
2609 }
2610 mlxcx_dma_free(&b->mlb_dma);
2611 ASSERT(list_is_empty(&b->mlb_tx_chain));
2612
2613 kmem_cache_free(mlxp->mlx_bufs_cache, b);
2614 }
2615
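/*
 * Shard state transitions: mark a shard as ready for use, or as draining
 * ahead of teardown (waking any waiters on the free list).
 */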
2616 void
2617 mlxcx_shard_ready(mlxcx_buf_shard_t *s)
2618 {
2619 mutex_enter(&s->mlbs_mtx);
2620 s->mlbs_state = MLXCX_SHARD_READY;
2621 mutex_exit(&s->mlbs_mtx);
2622 }
2623
2624 void
2625 mlxcx_shard_draining(mlxcx_buf_shard_t *s)
2626 {
2627 mutex_enter(&s->mlbs_mtx);
2628 s->mlbs_state = MLXCX_SHARD_DRAINING;
2629 cv_broadcast(&s->mlbs_free_nonempty);
2630 mutex_exit(&s->mlbs_mtx);
2631 }
2632