1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright (c) 2021, the University of Queensland
14 * Copyright 2020 RackTop Systems, Inc.
15 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
16 */
17
18 /*
19 * Mellanox Connect-X 4/5/6 driver.
20 */
21
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/disp.h>
27 #include <sys/sdt.h>
28
29 #include <sys/mac_provider.h>
30
31 #include <mlxcx.h>
32
33 /*
34 * CTASSERT(s) to cover bad values which would induce bugs.
35 */
36 CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
37
38 /*
39 * Disable interrupts.
40 * The act of calling ddi_intr_disable() does not guarantee an interrupt
41 * routine is not running, so flag the vector as quiescing and wait
42 * for anything active to finish.
43 */
44 void
45 mlxcx_intr_disable(mlxcx_t *mlxp)
46 {
47 int i;
48
49 mlxcx_cmd_eq_disable(mlxp);
50
51 for (i = 0; i < mlxp->mlx_intr_count; ++i) {
52 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
53
54 mutex_enter(&mleq->mleq_mtx);
55
56 if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
57 mutex_exit(&mleq->mleq_mtx);
58 continue;
59 }
60
61 (void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
62
63 mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
64 while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
65 cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);
66
67 mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;
68
69 mutex_exit(&mleq->mleq_mtx);
70 }
71 }
72
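/*
 * Free the interrupt vectors and the per-vector event queue state. The EQs
 * themselves must already have been destroyed (or never created).
 */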
73 void
74 mlxcx_intr_teardown(mlxcx_t *mlxp)
75 {
76 int i;
77 int ret;
78
79 for (i = 0; i < mlxp->mlx_intr_count; ++i) {
80 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
81
82 mutex_enter(&mleq->mleq_mtx);
83 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
84 if (mleq->mleq_state & MLXCX_EQ_CREATED)
85 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
86 if (i >= mlxp->mlx_intr_cq0) {
87 VERIFY(avl_is_empty(&mleq->mleq_cqs));
88 avl_destroy(&mleq->mleq_cqs);
89 }
90 mutex_exit(&mleq->mleq_mtx);
91 (void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
92 ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
93 if (ret != DDI_SUCCESS) {
94 mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
95 i, ret);
96 }
97 mutex_destroy(&mleq->mleq_mtx);
98 cv_destroy(&mleq->mleq_cv);
99 }
100 kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
101 kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
102 mlxp->mlx_intr_handles = NULL;
103 mlxp->mlx_eqs = NULL;
104 }
105
106 /*
107 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
108 */
109 static mlxcx_eventq_ent_t *
110 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
111 {
112 mlxcx_eventq_ent_t *ent;
113 ddi_fm_error_t err;
114 uint_t ci;
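 /*
  * The expected "software owner" bit is the parity of how many complete
  * passes mleq_cc has made through the queue (2^mleq_entshift entries
  * per pass). The hardware toggles the owner bit in each entry on every
  * pass, so an entry whose owner bit matches this value is ours to
  * consume.
  */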
115 const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
116
117 /*
118 * This should only be called from interrupt context to ensure
119 * correctness of mleq_cc.
120 */
121 ASSERT(servicing_interrupt());
122 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
123 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
124
125 /* mleq_nents is always a power of 2 */
126 ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
127
128 ent = &mleq->mleq_ent[ci];
129 VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
130 (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
131 sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
132 ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
133 DDI_FME_VERSION);
134 if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
135 /* The PRM says we have to membar here, so we're doing it */
136 membar_consumer();
137 ++mleq->mleq_cc;
138 return (ent);
139 }
140 /*
141 * In the case of a DMA error, we should re-arm this EQ and then come
142 * back and try again when the device wakes us back up.
143 *
144 * Hopefully the fault will be gone by then.
145 */
146 ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
147
148 return (NULL);
149 }
150
151 void
152 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
153 {
154 uint_t try = 0;
155 ddi_fm_error_t err;
156 bits32_t v = new_bits32();
157
158 /*
159 * This is only called during initialization when the EQ is
160 * armed for the first time, and when re-armed at the end of
161 * interrupt processing.
162 */
163 ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
164 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
165 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
166 ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
167 ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
168
169 mleq->mleq_state |= MLXCX_EQ_ARMED;
170 mleq->mleq_cc_armed = mleq->mleq_cc;
171
172 set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
173 set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
174
175 retry:
176 mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
177 from_bits32(v));
178 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
179 DDI_FME_VERSION);
180 if (err.fme_status == DDI_FM_OK)
181 return;
182 if (try++ < mlxcx_doorbell_tries) {
183 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
184 goto retry;
185 }
186 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
187 }
188
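/*
 * Write the current consumer counter to the EQ doorbell via the no-arm
 * register, updating the hardware's view of our progress without requesting
 * another interrupt.
 */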
189 static void
190 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
191 {
192 bits32_t v = new_bits32();
193 ddi_fm_error_t err;
194
195 /*
196 * This should only be called from interrupt context to ensure
197 * correctness of mleq_cc.
198 */
199 ASSERT(servicing_interrupt());
200 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
201 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
202 ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
203
204 set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
205 set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
206
207 mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
208 from_bits32(v));
209 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
210 DDI_FME_VERSION);
211 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
212 /*
213 * Ignore the error; if it's still happening when we try to re-arm the
214 * EQ, we will note the impact then.
215 */
216 }
217
218 static mlxcx_completionq_ent_t *
219 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
220 {
221 mlxcx_completionq_ent_t *ent;
222 ddi_fm_error_t err;
223 uint_t ci;
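 /*
  * As with the EQ, the expected owner bit flips on every pass of the
  * consumer counter through the queue.
  */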
224 const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
225
226 ASSERT(mutex_owned(&mlcq->mlcq_mtx));
227 ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
228 ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
229
230 /* mlcq_nents is always a power of 2 */
231 ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
232
233 ent = &mlcq->mlcq_ent[ci];
234 VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
235 (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
236 sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
237 ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
238 DDI_FME_VERSION);
239 if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
240 /* The PRM says we have to membar here, so we're doing it */
241 membar_consumer();
242 ++mlcq->mlcq_cc;
243 return (ent);
244 }
245 ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
246
247 return (NULL);
248 }
249
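/*
 * Tell the hardware how far we have consumed the CQ by writing the current
 * consumer counter into the CQ's doorbell record.
 */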
250 void
251 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
252 {
253 ddi_fm_error_t err;
254 uint_t try = 0;
255
256 mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
257
258 retry:
259 MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
260 ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
261 DDI_FME_VERSION);
262 if (err.fme_status != DDI_FM_OK) {
263 if (try++ < mlxcx_doorbell_tries) {
264 ddi_fm_dma_err_clear(
265 mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
266 DDI_FME_VERSION);
267 goto retry;
268 } else {
269 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
270 return;
271 }
272 }
273 }
274
275 void
276 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
277 {
278 bits32_t dbval = new_bits32();
279 uint64_t udbval;
280 ddi_fm_error_t err;
281 uint_t try = 0;
282
283 ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
284 ASSERT(mutex_owned(&mlcq->mlcq_mtx));
285 ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
286 ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
287
288 if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
289 ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
290 }
291
292 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
293 return;
294
295 atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
296 mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
297 mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
298
299 set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
300 set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
301
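 /*
  * The 64-bit UAR doorbell write carries the arm sequence/CI word in its
  * upper 32 bits and the CQ number in the low 24 bits.
  */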
302 udbval = (uint64_t)from_bits32(dbval) << 32;
303 udbval |= mlcq->mlcq_num & 0xffffff;
304
305 mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
306 mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
307
308 retry:
309 MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
310 ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
311 DDI_FME_VERSION);
312 if (err.fme_status != DDI_FM_OK) {
313 if (try++ < mlxcx_doorbell_tries) {
314 ddi_fm_dma_err_clear(
315 mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
316 DDI_FME_VERSION);
317 goto retry;
318 } else {
319 goto err;
320 }
321 }
322
323 mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
324 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
325 DDI_FME_VERSION);
326 if (err.fme_status == DDI_FM_OK)
327 return;
328 if (try++ < mlxcx_doorbell_tries) {
329 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
330 goto retry;
331 }
332
333 err:
334 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
335 }
336
337 const char *
338 mlxcx_event_name(mlxcx_event_t evt)
339 {
340 switch (evt) {
341 case MLXCX_EVENT_COMPLETION:
342 return ("COMPLETION");
343 case MLXCX_EVENT_PATH_MIGRATED:
344 return ("PATH_MIGRATED");
345 case MLXCX_EVENT_COMM_ESTABLISH:
346 return ("COMM_ESTABLISH");
347 case MLXCX_EVENT_SENDQ_DRAIN:
348 return ("SENDQ_DRAIN");
349 case MLXCX_EVENT_LAST_WQE:
350 return ("LAST_WQE");
351 case MLXCX_EVENT_SRQ_LIMIT:
352 return ("SRQ_LIMIT");
353 case MLXCX_EVENT_DCT_ALL_CLOSED:
354 return ("DCT_ALL_CLOSED");
355 case MLXCX_EVENT_DCT_ACCKEY_VIOL:
356 return ("DCT_ACCKEY_VIOL");
357 case MLXCX_EVENT_CQ_ERROR:
358 return ("CQ_ERROR");
359 case MLXCX_EVENT_WQ_CATASTROPHE:
360 return ("WQ_CATASTROPHE");
361 case MLXCX_EVENT_PATH_MIGRATE_FAIL:
362 return ("PATH_MIGRATE_FAIL");
363 case MLXCX_EVENT_PAGE_FAULT:
364 return ("PAGE_FAULT");
365 case MLXCX_EVENT_WQ_INVALID_REQ:
366 return ("WQ_INVALID_REQ");
367 case MLXCX_EVENT_WQ_ACCESS_VIOL:
368 return ("WQ_ACCESS_VIOL");
369 case MLXCX_EVENT_SRQ_CATASTROPHE:
370 return ("SRQ_CATASTROPHE");
371 case MLXCX_EVENT_INTERNAL_ERROR:
372 return ("INTERNAL_ERROR");
373 case MLXCX_EVENT_PORT_STATE:
374 return ("PORT_STATE");
375 case MLXCX_EVENT_GPIO:
376 return ("GPIO");
377 case MLXCX_EVENT_PORT_MODULE:
378 return ("PORT_MODULE");
379 case MLXCX_EVENT_TEMP_WARNING:
380 return ("TEMP_WARNING");
381 case MLXCX_EVENT_REMOTE_CONFIG:
382 return ("REMOTE_CONFIG");
383 case MLXCX_EVENT_DCBX_CHANGE:
384 return ("DCBX_CHANGE");
385 case MLXCX_EVENT_DOORBELL_CONGEST:
386 return ("DOORBELL_CONGEST");
387 case MLXCX_EVENT_STALL_VL:
388 return ("STALL_VL");
389 case MLXCX_EVENT_CMD_COMPLETION:
390 return ("CMD_COMPLETION");
391 case MLXCX_EVENT_PAGE_REQUEST:
392 return ("PAGE_REQUEST");
393 case MLXCX_EVENT_NIC_VPORT:
394 return ("NIC_VPORT");
395 case MLXCX_EVENT_EC_PARAMS_CHANGE:
396 return ("EC_PARAMS_CHANGE");
397 case MLXCX_EVENT_XRQ_ERROR:
398 return ("XRQ_ERROR");
399 }
400 return ("UNKNOWN");
401 }
402
403 /* Should be called only when link state has changed. */
404 void
405 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
406 {
407 link_state_t ls;
408
409 mutex_enter(&port->mlp_mtx);
410 (void) mlxcx_cmd_query_port_status(mlxp, port);
411 (void) mlxcx_cmd_query_port_speed(mlxp, port);
412 (void) mlxcx_cmd_query_port_fec(mlxp, port);
413
414 switch (port->mlp_oper_status) {
415 case MLXCX_PORT_STATUS_UP:
416 case MLXCX_PORT_STATUS_UP_ONCE:
417 ls = LINK_STATE_UP;
418 break;
419 case MLXCX_PORT_STATUS_DOWN:
420 ls = LINK_STATE_DOWN;
421 break;
422 default:
423 ls = LINK_STATE_UNKNOWN;
424 }
425
426 if (mlxp->mlx_mac_hdl != NULL)
427 mac_link_update(mlxp->mlx_mac_hdl, ls);
428
429 mutex_exit(&port->mlp_mtx);
430 }
431
432 CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);
433
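/*
 * Allocate up to npages pages of DMA memory and hand them to the HCA via a
 * MANAGE_PAGES command, recording each page in the mlx_pages tree so that it
 * can be reclaimed later.
 */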
434 static void
435 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
436 {
437 ddi_device_acc_attr_t acc;
438 ddi_dma_attr_t attr;
439 mlxcx_dev_page_t *mdp;
440 mlxcx_dev_page_t **pages;
441 size_t i;
442 const ddi_dma_cookie_t *ck;
443
444 /*
445 * If this isn't enough, the HCA will ask for more
446 */
447 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
448
449 pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);
450
451 for (i = 0; i < npages; i++) {
452 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
453 mlxcx_dma_acc_attr(mlxp, &acc);
454 mlxcx_dma_page_attr(mlxp, &attr);
455 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
456 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
457 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
458 npages);
459 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
460 goto cleanup_npages;
461 }
462 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
463 mdp->mxdp_pa = ck->dmac_laddress;
464 pages[i] = mdp;
465 }
466
467 mutex_enter(&mlxp->mlx_pagemtx);
468
469 if (!mlxcx_cmd_give_pages(mlxp,
470 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
471 mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
472 "pages!", npages);
473 mutex_exit(&mlxp->mlx_pagemtx);
474 goto cleanup_npages;
475 }
476
477 for (i = 0; i < npages; i++) {
478 avl_add(&mlxp->mlx_pages, pages[i]);
479 }
480 mlxp->mlx_npages += npages;
481 mutex_exit(&mlxp->mlx_pagemtx);
482
483 kmem_free(pages, sizeof (*pages) * npages);
484
485 return;
486
487 cleanup_npages:
488 for (i = 0; i < npages; i++) {
489 if ((mdp = pages[i]) == NULL)
490 break;
491
492 mlxcx_dma_free(&mdp->mxdp_dma);
493 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
494 }
495 /* Tell the hardware we had an allocation failure. */
496 (void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
497 0, NULL);
498 mutex_exit(&mlxp->mlx_pagemtx);
499
500 kmem_free(pages, sizeof (*pages) * npages);
501 }
502
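/*
 * Reclaim up to npages pages from the HCA, freeing the DMA memory for each
 * page it returns and removing it from the mlx_pages tree.
 */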
503 static void
504 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
505 {
506 uint_t i;
507 int32_t ret;
508 uint64_t *pas;
509 mlxcx_dev_page_t *mdp, probe;
510
511 pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);
512
513 if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
514 kmem_free(pas, sizeof (*pas) * npages);
515 return;
516 }
517
518 mutex_enter(&mlxp->mlx_pagemtx);
519
520 ASSERT0(avl_is_empty(&mlxp->mlx_pages));
521
522 for (i = 0; i < ret; i++) {
523 bzero(&probe, sizeof (probe));
524 probe.mxdp_pa = pas[i];
525
526 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
527
528 if (mdp != NULL) {
529 avl_remove(&mlxp->mlx_pages, mdp);
530 mlxp->mlx_npages--;
531 mlxcx_dma_free(&mdp->mxdp_dma);
532 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
533 } else {
534 mlxcx_warn(mlxp, "hardware returned a page "
535 "with PA 0x%" PRIx64 " but we have no "
536 "record of giving out such a page", pas[i]);
537 }
538 }
539
540 mutex_exit(&mlxp->mlx_pagemtx);
541
542 kmem_free(pas, sizeof (*pas) * npages);
543 }
544
545 static void
546 mlxcx_pages_task(void *arg)
547 {
548 mlxcx_async_param_t *param = arg;
549 mlxcx_t *mlxp = param->mla_mlx;
550 int32_t npages;
551
552 /*
553 * We can drop the pending status now, as we've extracted what
554 * is needed to process the pages request.
555 *
556 * Even though we should never get another pages request until
557 * we have responded to this, along with the guard in mlxcx_sync_intr,
558 * this safely allows the reuse of mlxcx_async_param_t.
559 */
560 mutex_enter(¶m->mla_mtx);
561 npages = param->mla_pages.mlp_npages;
562 param->mla_pending = B_FALSE;
563 bzero(¶m->mla_pages, sizeof (param->mla_pages));
564 mutex_exit(¶m->mla_mtx);
565
566 /*
567 * The PRM describes npages as: "Number of missing / unneeded pages
568 * (signed number, msb indicate sign)". The implication is that
569 * it will not be zero. We are expected to use this to give or
570 * take back pages (based on the sign) using the MANAGE_PAGES
571 * command but we can't determine whether to give or take
572 * when npages is zero. So we do nothing.
573 */
574 if (npages > 0) {
575 mlxcx_give_pages_once(mlxp, npages);
576 } else if (npages < 0) {
577 mlxcx_take_pages_once(mlxp, -1 * npages);
578 }
579 }
580
581 static void
582 mlxcx_link_state_task(void *arg)
583 {
584 mlxcx_async_param_t *param = arg;
585 mlxcx_port_t *port;
586 mlxcx_t *mlxp;
587
588 /*
589 * Gather the arguments from the parameters and clear the
590 * pending status.
591 *
592 * The pending status must be cleared *before* we update the
593 * link state. This is both safe and required to ensure we always
594 * have the correct link state. It is safe because taskq_ents are
595 * reusable (by the caller of taskq_dispatch_ent()) once the
596 * task function has started executing. It is necessary before
597 * updating the link state to guarantee further link state change
598 * events are not missed and we always have the current link state.
599 */
600 mutex_enter(¶m->mla_mtx);
601 mlxp = param->mla_mlx;
602 port = param->mla_port;
603 param->mla_pending = B_FALSE;
604 mutex_exit(¶m->mla_mtx);
605
606 mlxcx_update_link_state(mlxp, port);
607 }
608
609 static const char *
610 mlxcx_module_error_string(mlxcx_module_error_type_t err)
611 {
612 switch (err) {
613 case MLXCX_MODULE_ERR_POWER_BUDGET:
614 return ("POWER_BUDGET");
615 case MLXCX_MODULE_ERR_LONG_RANGE:
616 return ("LONG_RANGE");
617 case MLXCX_MODULE_ERR_BUS_STUCK:
618 return ("BUS_STUCK");
619 case MLXCX_MODULE_ERR_NO_EEPROM:
620 return ("NO_EEPROM");
621 case MLXCX_MODULE_ERR_ENFORCEMENT:
622 return ("ENFORCEMENT");
623 case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
624 return ("UNKNOWN_IDENT");
625 case MLXCX_MODULE_ERR_HIGH_TEMP:
626 return ("HIGH_TEMP");
627 case MLXCX_MODULE_ERR_CABLE_SHORTED:
628 return ("CABLE_SHORTED");
629 default:
630 return ("UNKNOWN");
631 }
632 }
633
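/*
 * Post an FM ereport describing a transceiver module error delivered via a
 * PORT_MODULE event.
 */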
634 static void
635 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
636 {
637 uint64_t ena;
638 char buf[FM_MAX_CLASS];
639 const char *lename;
640 const char *ename;
641 const char *stname;
642 uint_t eno = 0;
643 mlxcx_module_status_t state = evd->mled_port_mod_module_status;
644
645 switch (state) {
646 case MLXCX_MODULE_ERROR:
647 stname = "error";
648 eno = evd->mled_port_mod_error_type;
649 lename = mlxcx_module_error_string(eno);
650 switch (eno) {
651 case MLXCX_MODULE_ERR_ENFORCEMENT:
652 ename = DDI_FM_TXR_ERROR_WHITELIST;
653 break;
654 case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
655 case MLXCX_MODULE_ERR_NO_EEPROM:
656 ename = DDI_FM_TXR_ERROR_NOTSUPP;
657 break;
658 case MLXCX_MODULE_ERR_HIGH_TEMP:
659 ename = DDI_FM_TXR_ERROR_OVERTEMP;
660 break;
661 case MLXCX_MODULE_ERR_POWER_BUDGET:
662 case MLXCX_MODULE_ERR_LONG_RANGE:
663 case MLXCX_MODULE_ERR_CABLE_SHORTED:
664 ename = DDI_FM_TXR_ERROR_HWFAIL;
665 break;
666 case MLXCX_MODULE_ERR_BUS_STUCK:
667 default:
668 ename = DDI_FM_TXR_ERROR_UNKNOWN;
669 }
670 break;
671 default:
672 return;
673 }
674
675 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
676 DDI_FM_NIC, DDI_FM_TXR_ERROR);
677 ena = fm_ena_generate(0, FM_ENA_FMT1);
678 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
679 return;
680
681 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
682 /* compulsory FM props */
683 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
684 /* generic NIC txr error event props */
685 "error", DATA_TYPE_STRING, ename,
686 "port_index", DATA_TYPE_UINT8, 0,
687 "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
688 /* local props */
689 "mlxcx_state", DATA_TYPE_STRING, stname,
690 "mlxcx_error", DATA_TYPE_STRING, lename,
691 "mlxcx_error_num", DATA_TYPE_UINT8, eno,
692 NULL);
693 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
694 }
695
696 /*
697 * Common beginning of interrupt processing.
698 * Confirm interrupt hasn't been disabled, verify its state and
699 * mark the vector as active.
700 */
701 static boolean_t
702 mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
703 {
704 mutex_enter(&mleq->mleq_mtx);
705
706 if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
707 mutex_exit(&mleq->mleq_mtx);
708 return (B_FALSE);
709 }
710
711 if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
712 !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
713 (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
714 mlxcx_warn(mlxp, "intr %d in bad eq state",
715 mleq->mleq_intr_index);
716 mutex_exit(&mleq->mleq_mtx);
717 return (B_FALSE);
718 }
719
720 mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
721 mutex_exit(&mleq->mleq_mtx);
722
723 return (B_TRUE);
724 }
725
726 /*
727 * End of interrupt processing.
728 * Mark vector as no longer active and if shutdown is blocked on this vector,
729 * wake it up.
730 */
731 static void
732 mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
733 {
734 mutex_enter(&mleq->mleq_mtx);
735 if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
736 cv_signal(&mleq->mleq_cv);
737
738 mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
739 mutex_exit(&mleq->mleq_mtx);
740 }
741
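/*
 * Interrupt handler for the asynchronous event vector (vector 0): command
 * completions, page requests, port state changes and module events.
 */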
742 static uint_t
743 mlxcx_intr_async(caddr_t arg, caddr_t arg2)
744 {
745 mlxcx_t *mlxp = (mlxcx_t *)arg;
746 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
747 mlxcx_eventq_ent_t *ent;
748 mlxcx_async_param_t *param;
749 uint_t portn;
750 uint16_t func;
751
752 if (!mlxcx_intr_ini(mlxp, mleq))
753 return (DDI_INTR_CLAIMED);
754
755 ent = mlxcx_eq_next(mleq);
756 if (ent == NULL) {
757 goto done;
758 }
759
760 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
761 mleq->mleq_state &= ~MLXCX_EQ_ARMED;
762
763 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
764 DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
765 ent);
766
767 /*
768 * Handle events which can be processed while we're still in
769 * mlxcx_attach(). Everything on the mlxcx_t which these events
770 * use must be allocated and set up prior to the call to
771 * mlxcx_setup_async_eqs().
772 */
773 switch (ent->mleqe_event_type) {
774 case MLXCX_EVENT_CMD_COMPLETION:
775 mlxcx_cmd_completion(mlxp, ent);
776 continue;
777 case MLXCX_EVENT_PAGE_REQUEST:
778 func = from_be16(ent->mleqe_page_request.
779 mled_page_request_function_id);
780 VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);
781
782 param = &mlxp->mlx_npages_req[func];
783 mutex_enter(¶m->mla_mtx);
784 if (param->mla_pending) {
785 /*
786 * The PRM states we will not get another
787 * page request event until any pending have
788 * been posted as complete to the HCA.
789 * This will guard against this anyway.
790 */
791 mutex_exit(¶m->mla_mtx);
792 mlxcx_warn(mlxp, "Unexpected page request "
793 "whilst another is pending");
794 continue;
795 }
796 param->mla_pages.mlp_npages =
797 (int32_t)from_be32(ent->mleqe_page_request.
798 mled_page_request_num_pages);
799 param->mla_pages.mlp_func = func;
800 param->mla_pending = B_TRUE;
801 ASSERT3P(param->mla_mlx, ==, mlxp);
802 mutex_exit(¶m->mla_mtx);
803
804 taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
805 param, 0, ¶m->mla_tqe);
806 continue;
807 }
808
809 /*
810 * All other events should be ignored while in attach.
811 */
812 mutex_enter(&mleq->mleq_mtx);
813 if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
814 mutex_exit(&mleq->mleq_mtx);
815 continue;
816 }
817 mutex_exit(&mleq->mleq_mtx);
818
819 switch (ent->mleqe_event_type) {
820 case MLXCX_EVENT_PORT_STATE:
821 portn = get_bits8(
822 ent->mleqe_port_state.mled_port_state_port_num,
823 MLXCX_EVENT_PORT_NUM) - 1;
824 if (portn >= mlxp->mlx_nports)
825 break;
826
827 param = &mlxp->mlx_ports[portn].mlx_port_event;
828 mutex_enter(¶m->mla_mtx);
829 if (param->mla_pending) {
830 /*
831 * There is a link state event pending
832 * processing. When that event is handled
833 * it will get the current link state.
834 */
835 mutex_exit(¶m->mla_mtx);
836 break;
837 }
838
839 ASSERT3P(param->mla_mlx, ==, mlxp);
840 ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);
841
842 param->mla_pending = B_TRUE;
843 mutex_exit(¶m->mla_mtx);
844
845 taskq_dispatch_ent(mlxp->mlx_async_tq,
846 mlxcx_link_state_task, param, 0, ¶m->mla_tqe);
847 break;
848 case MLXCX_EVENT_PORT_MODULE:
849 mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
850 break;
851 default:
852 mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
853 ent->mleqe_event_type, mleq->mleq_intr_index);
854 }
855 }
856
857 mlxcx_arm_eq(mlxp, mleq);
858
859 done:
860 mlxcx_intr_fini(mleq);
861 return (DDI_INTR_CLAIMED);
862 }
863
864 static boolean_t
865 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
866 size_t bytelim)
867 {
868 mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
869 mlxcx_completionq_ent_t *cent;
870 mblk_t *mp, *cmp, *nmp;
871 mlxcx_buffer_t *buf;
872 boolean_t found, added;
873 size_t bytes = 0;
874 uint_t rx_frames = 0;
875 uint_t comp_cnt = 0;
876 int64_t wqebbs, bufcnt;
877
878 *mpp = NULL;
879
880 if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
881 !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
882 (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
883 (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
884 return (B_FALSE);
885 }
886
887 nmp = cmp = mp = NULL;
888
889 wqebbs = 0;
890 bufcnt = 0;
891 for (cent = mlxcx_cq_next(mlcq); cent != NULL;
892 cent = mlxcx_cq_next(mlcq)) {
893 /*
894 * Teardown and ring stop can atomic_or this flag
895 * into our state if they want us to stop early.
896 */
897 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
898 return (B_FALSE);
899
900 comp_cnt++;
901 if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
902 cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
903 /* NOP */
904 atomic_dec_64(&wq->mlwq_wqebb_used);
905 goto nextcq;
906 }
907
908 lookagain:
909 /*
910 * Generally the buffer we're looking for will be
911 * at the front of the list, so this loop won't
912 * need to look far.
913 */
914 buf = list_head(&mlcq->mlcq_buffers);
915 found = B_FALSE;
916 while (buf != NULL) {
917 if ((buf->mlb_wqe_index & UINT16_MAX) ==
918 from_be16(cent->mlcqe_wqe_counter)) {
919 found = B_TRUE;
920 break;
921 }
922 buf = list_next(&mlcq->mlcq_buffers, buf);
923 }
924
925 if (!found) {
926 /*
927 * If there's any buffers waiting on the
928 * buffers_b list, then merge those into
929 * the main list and have another look.
930 *
931 * The wq enqueue routines push new buffers
932 * into buffers_b so that they can avoid
933 * taking the mlcq_mtx and blocking us for
934 * every single packet.
935 */
936 added = B_FALSE;
937 mutex_enter(&mlcq->mlcq_bufbmtx);
938 if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
939 list_move_tail(&mlcq->mlcq_buffers,
940 &mlcq->mlcq_buffers_b);
941 added = B_TRUE;
942 }
943 mutex_exit(&mlcq->mlcq_bufbmtx);
944 if (added)
945 goto lookagain;
946
947 /*
948 * This check could go just after the lookagain
949 * label, but it is a hot code path so we don't
950 * want to unnecessarily grab a lock and check
951 * a flag for a relatively rare event (the ring
952 * being stopped).
953 */
954 mutex_enter(&wq->mlwq_mtx);
955 if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
956 mutex_exit(&wq->mlwq_mtx);
957 goto nextcq;
958 }
959 mutex_exit(&wq->mlwq_mtx);
960
961 buf = list_head(&mlcq->mlcq_buffers);
962 mlxcx_warn(mlxp, "got completion on CQ %x but "
963 "no buffer matching wqe found: %x (first "
964 "buffer counter = %x)", mlcq->mlcq_num,
965 from_be16(cent->mlcqe_wqe_counter),
966 buf == NULL ? UINT32_MAX :
967 buf->mlb_wqe_index);
968 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
969 goto nextcq;
970 }
971
972 /*
973 * The buf is likely to be freed below, count this now.
974 */
975 wqebbs += buf->mlb_wqebbs;
976
977 list_remove(&mlcq->mlcq_buffers, buf);
978 bufcnt++;
979
980 switch (mlcq->mlcq_wq->mlwq_type) {
981 case MLXCX_WQ_TYPE_SENDQ:
982 mlxcx_tx_completion(mlxp, mlcq, cent, buf);
983 break;
984 case MLXCX_WQ_TYPE_RECVQ:
985 nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
986 bytes += from_be32(cent->mlcqe_byte_cnt);
987 if (nmp != NULL) {
988 if (cmp != NULL) {
989 cmp->b_next = nmp;
990 cmp = nmp;
991 } else {
992 mp = cmp = nmp;
993 }
994
995 rx_frames++;
996 }
997 break;
998 }
999
1000 /*
1001 * Update the consumer index with what has been processed,
1002 * followed by driver counters. It is important to tell the
1003 * hardware first, otherwise when we throw more packets at
1004 * it, it may get an overflow error.
1005 * We do this whenever we've processed enough to bridge the
1006 * high->low water mark.
1007 */
1008 if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
1009 mlxcx_update_cqci(mlxp, mlcq);
1010 /*
1011 * Both these variables are incremented using
1012 * atomics as they are modified in other code paths
1013 * (Eg during tx) which hold different locks.
1014 */
1015 atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
1016 atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
1017 wqebbs = 0;
1018 bufcnt = 0;
1019 comp_cnt = 0;
1020 }
1021 nextcq:
1022 if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
1023 (bytelim != 0 && bytes > bytelim))
1024 break;
1025 }
1026
1027 if (comp_cnt > 0) {
1028 mlxcx_update_cqci(mlxp, mlcq);
1029 atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
1030 atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
1031 }
1032
1033 *mpp = mp;
1034 return (B_TRUE);
1035 }
1036
1037
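/*
 * Harvest receive completions for a ring which is being polled rather than
 * interrupt driven. Called with the CQ mutex held; returns a chain of
 * received mblks, limited to roughly bytelim bytes.
 */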
1038 mblk_t *
1039 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
1040 {
1041 mblk_t *mp = NULL;
1042
1043 ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1044
1045 ASSERT(mlcq->mlcq_wq != NULL);
1046 ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
1047
1048 (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
1049
1050 return (mp);
1051 }
1052
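/*
 * Interrupt handler for the completion vectors. Walks the event queue,
 * processes each completion queue it refers to, and hands any received
 * packets up to mac.
 */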
1053 static uint_t
1054 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
1055 {
1056 mlxcx_t *mlxp = (mlxcx_t *)arg;
1057 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
1058 mlxcx_eventq_ent_t *ent;
1059 mlxcx_completion_queue_t *mlcq, probe;
1060 mlxcx_work_queue_t *mlwq;
1061 mblk_t *mp = NULL;
1062 boolean_t tellmac = B_FALSE;
1063
1064 if (!mlxcx_intr_ini(mlxp, mleq))
1065 return (DDI_INTR_CLAIMED);
1066
1067 ent = mlxcx_eq_next(mleq);
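 /*
  * An interrupt with no new event queue entries: if this keeps
  * happening, assume the vector is stuck and disable it.
  */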
1068 if (ent == NULL) {
1069 if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
1070 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
1071 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1072 (void) ddi_intr_disable(mlxp->mlx_intr_handles[
1073 mleq->mleq_intr_index]);
1074 }
1075 goto done;
1076 }
1077 mleq->mleq_badintrs = 0;
1078
1079 mutex_enter(&mleq->mleq_mtx);
1080 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
1081 mleq->mleq_state &= ~MLXCX_EQ_ARMED;
1082 #if defined(DEBUG)
1083 /*
1084 * If we're still in mlxcx_attach and an intr_n fired, something really
1085 * weird is going on. This shouldn't happen in the absence of a driver
1086 * or firmware bug, so in the interests of minimizing branches in this
1087 * function this check is under DEBUG.
1088 */
1089 if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
1090 mutex_exit(&mleq->mleq_mtx);
1091 mlxcx_warn(mlxp, "intr_n (%u) fired during attach, disabling "
1092 "vector", mleq->mleq_intr_index);
1093 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
1094 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1095 (void) ddi_intr_disable(mlxp->mlx_intr_handles[
1096 mleq->mleq_intr_index]);
1097 goto done;
1098 }
1099 #endif
1100 mutex_exit(&mleq->mleq_mtx);
1101
1102 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
1103 ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
1104
1105 probe.mlcq_num =
1106 from_be24(ent->mleqe_completion.mled_completion_cqn);
1107 mutex_enter(&mleq->mleq_mtx);
1108 mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
1109 mutex_exit(&mleq->mleq_mtx);
1110
1111 if (mlcq == NULL)
1112 goto update_eq;
1113
1114 mlwq = mlcq->mlcq_wq;
1115
1116 /*
1117 * mlcq_arm_mtx is used to avoid race conditions between
1118 * this interrupt routine and the transition from polling
1119 * back to interrupt mode. When exiting poll mode the
1120 * CQ is likely to be un-armed, which means there will
1121 * be no events for the CQ coming through here,
1122 * consequently very low contention on mlcq_arm_mtx.
1123 *
1124 * mlcq_arm_mtx must be released before calls into mac
1125 * layer in order to avoid deadlocks.
1126 */
1127 mutex_enter(&mlcq->mlcq_arm_mtx);
1128 mlcq->mlcq_ec++;
1129 atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
1130
1131 if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
1132 /*
1133 * If we failed to take the mutex because the
1134 * polling function has it, just move on.
1135 * We don't want to block other CQs behind
1136 * this one.
1137 */
1138 if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
1139 mutex_exit(&mlcq->mlcq_arm_mtx);
1140 goto update_eq;
1141 }
1142
1143 /* Otherwise we will wait. */
1144 mutex_enter(&mlcq->mlcq_mtx);
1145 }
1146
1147 if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
1148 mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
1149 /*
1150 * The ring is not in polling mode and we processed
1151 * some completion queue entries.
1152 */
1153 if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
1154 mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
1155 atomic_and_uint(&mlcq->mlcq_state,
1156 ~MLXCX_CQ_BLOCKED_MAC);
1157 tellmac = B_TRUE;
1158 }
1159
1160 if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
1161 mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
1162 atomic_and_uint(&mlwq->mlwq_state,
1163 ~MLXCX_WQ_BLOCKED_MAC);
1164 tellmac = B_TRUE;
1165 }
1166
1167 mlxcx_arm_cq(mlxp, mlcq);
1168
1169 mutex_exit(&mlcq->mlcq_mtx);
1170 mutex_exit(&mlcq->mlcq_arm_mtx);
1171
1172 if (tellmac) {
1173 mac_tx_ring_update(mlxp->mlx_mac_hdl,
1174 mlcq->mlcq_mac_hdl);
1175 tellmac = B_FALSE;
1176 }
1177
1178 if (mp != NULL) {
1179 mac_rx_ring(mlxp->mlx_mac_hdl,
1180 mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
1181 }
1182 } else {
1183 mutex_exit(&mlcq->mlcq_mtx);
1184 mutex_exit(&mlcq->mlcq_arm_mtx);
1185 }
1186
1187 update_eq:
1188 /*
1189 * Updating the consumer counter for an EQ requires a write
1190 * to the UAR, which is possibly expensive.
1191 *
1192 * Try to do it only often enough to stop us wrapping around.
1193 */
1194 if ((mleq->mleq_cc & 0x7) == 0)
1195 mlxcx_update_eq(mlxp, mleq);
1196 }
1197
1198 mlxcx_arm_eq(mlxp, mleq);
1199
1200 done:
1201 mlxcx_intr_fini(mleq);
1202 return (DDI_INTR_CLAIMED);
1203 }
1204
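/*
 * Allocate and attach the MSI-X interrupt vectors: vector 0 for asynchronous
 * events and the rest for completion queue handling.
 */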
1205 boolean_t
1206 mlxcx_intr_setup(mlxcx_t *mlxp)
1207 {
1208 dev_info_t *dip = mlxp->mlx_dip;
1209 int ret;
1210 int nintrs = 0;
1211 int navail = 0;
1212 int types, i;
1213 mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
1214
1215 ret = ddi_intr_get_supported_types(dip, &types);
1216 if (ret != DDI_SUCCESS) {
1217 mlxcx_warn(mlxp, "Failed to get supported interrupt types");
1218 return (B_FALSE);
1219 }
1220
1221 if (!(types & DDI_INTR_TYPE_MSIX)) {
1222 mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
1223 "requires MSI-X");
1224 return (B_FALSE);
1225 }
1226
1227 ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
1228 if (ret != DDI_SUCCESS) {
1229 mlxcx_warn(mlxp, "Failed to get number of interrupts");
1230 return (B_FALSE);
1231 }
1232 if (nintrs < 2) {
1233 mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx "
1234 "requires 2", nintrs);
1235 return (B_FALSE);
1236 }
1237
1238 ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
1239 if (ret != DDI_SUCCESS) {
1240 mlxcx_warn(mlxp,
1241 "Failed to get number of available interrupts");
1242 return (B_FALSE);
1243 }
1244 if (navail < 2) {
1245 mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
1246 "requires 2", navail);
1247 return (B_FALSE);
1248 }
1249
1250 mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
1251 mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
1252 /*
1253 * Interrupts for Completion Queues events start from vector 1
1254 * up to available vectors. Vector 0 is used for asynchronous
1255 * events.
1256 */
1257 mlxp->mlx_intr_cq0 = 1;
1258
1259 ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
1260 0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
1261 if (ret != DDI_SUCCESS) {
1262 mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
1263 mlxcx_intr_teardown(mlxp);
1264 return (B_FALSE);
1265 }
1266 if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
1267 mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
1268 "requires %d", mlxp->mlx_intr_count,
1269 mlxp->mlx_intr_cq0 + 1);
1270 mlxcx_intr_teardown(mlxp);
1271 return (B_FALSE);
1272 }
1273 mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
1274
1275 ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
1276 if (ret != DDI_SUCCESS) {
1277 mlxcx_warn(mlxp, "Failed to get interrupt priority");
1278 mlxcx_intr_teardown(mlxp);
1279 return (B_FALSE);
1280 }
1281
1282 /*
1283 * Set the interrupt priority for the asynchronous handler higher
1284 * than the ring handlers. Some operations which issue commands,
1285 * and thus rely on the async interrupt handler for posting
1286 * completion, do so with a CQ mutex held. The CQ mutex is also
1287 * acquired during ring processing, so if the ring processing vector
1288 * happens to be assigned to the same CPU as the async vector
1289 * it can hold off the async interrupt thread and lead to a deadlock.
1290 * By assigning a higher priority to the async vector, it will
1291 * always be dispatched.
1292 */
1293 mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
1294 if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
1295 mlxp->mlx_async_intr_pri++;
1296 } else {
1297 mlxp->mlx_intr_pri--;
1298 }
1299
1300 mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1301 sizeof (mlxcx_event_queue_t);
1302 mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1303
1304 /*
1305 * In the failure path, mlxcx_intr_teardown() expects this
1306 * mutex and avl tree to be init'ed - so do it now.
1307 */
1308 for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1309 uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
1310 mlxp->mlx_intr_pri;
1311
1312 mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1313 DDI_INTR_PRI(pri));
1314 cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);
1315
1316 if (i < mlxp->mlx_intr_cq0)
1317 continue;
1318
1319 avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1320 sizeof (mlxcx_completion_queue_t),
1321 offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1322 }
1323
1324 while (mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN) {
1325 ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
1326 mlxp->mlx_async_intr_pri);
1327 if (ret == DDI_SUCCESS)
1328 break;
1329 mlxcx_note(mlxp,
1330 "!Failed to set interrupt priority to %u for "
1331 "async interrupt vector", mlxp->mlx_async_intr_pri);
1332 /*
1333 * If it was not possible to set the IPL for the async
1334 * interrupt to the desired value, then try a lower priority.
1335 * Some PSMs can only accommodate a limited number of vectors
1336 * at each priority level (or group of priority levels). Since
1337 * the async priority must be set higher than the ring
1338 * handlers, lower both. The ring handler priority is set
1339 * below.
1340 */
1341 mlxp->mlx_async_intr_pri--;
1342 mlxp->mlx_intr_pri--;
1343 }
1344
1345 if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) {
1346 mlxcx_warn(mlxp, "Failed to find an interrupt priority for "
1347 "async interrupt vector");
1348 mlxcx_intr_teardown(mlxp);
1349 return (B_FALSE);
1350 }
1351
1352 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
1353 (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1354 if (ret != DDI_SUCCESS) {
1355 mlxcx_warn(mlxp, "Failed to add async interrupt handler");
1356 mlxcx_intr_teardown(mlxp);
1357 return (B_FALSE);
1358 }
1359
1360 /*
1361 * If we have enough interrupts, set their "type" fields so that we
1362 * avoid mixing RX and TX queues on the same EQs.
1363 */
1364 if (mlxp->mlx_intr_count >= 8) {
1365 eqt = MLXCX_EQ_TYPE_RX;
1366 }
1367
1368 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
1369 mlxp->mlx_eqs[i].mleq_intr_index = i;
1370
1371 mlxp->mlx_eqs[i].mleq_type = eqt;
1372 /*
1373 * If eqt is still ANY, just leave it set to that
1374 * (no else here).
1375 */
1376 if (eqt == MLXCX_EQ_TYPE_RX) {
1377 eqt = MLXCX_EQ_TYPE_TX;
1378 } else if (eqt == MLXCX_EQ_TYPE_TX) {
1379 eqt = MLXCX_EQ_TYPE_RX;
1380 }
1381
1382 while (mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN) {
1383 ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
1384 mlxp->mlx_intr_pri);
1385 if (ret == DDI_SUCCESS)
1386 break;
1387 mlxcx_note(mlxp, "!Failed to set interrupt priority to "
1388 "%u for interrupt vector %d",
1389 mlxp->mlx_intr_pri, i);
1390 mlxp->mlx_intr_pri--;
1391 }
1392 if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) {
1393 mlxcx_warn(mlxp,
1394 "Failed to find an interrupt priority for "
1395 "interrupt vector %d", i);
1396 mlxcx_intr_teardown(mlxp);
1397 return (B_FALSE);
1398 }
1399
1400 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1401 mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1402 if (ret != DDI_SUCCESS) {
1403 mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
1404 i);
1405 mlxcx_intr_teardown(mlxp);
1406 return (B_FALSE);
1407 }
1408 }
1409
1410 return (B_TRUE);
1411 }
1412