xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_intr.c (revision 379728489ed47862c4927c75771e767b9476c9c4)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2020, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/disp.h>
27 #include <sys/sdt.h>
28 
29 #include <sys/mac_provider.h>
30 
31 #include <mlxcx.h>
32 
33 /*
34  * CTASSERT(s) to cover bad values which would induce bugs.
35  */
36 CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
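
/*
 * mlxcx_process_cq() relies on this: it only writes the CQ consumer index
 * back to the hardware after more than (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)
 * buffers have been processed, and the assertion above keeps that
 * difference non-negative so the batching threshold cannot underflow.
 */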
37 
38 /*
39  * Disable interrupts.
40  * The act of calling ddi_intr_disable() does not guarantee an interrupt
41  * routine is not running, so flag the vector as quiescing and wait
42  * for anything active to finish.
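 * The wait is ended by mlxcx_intr_fini(), which signals mleq_cv and clears
 * MLXCX_EQ_INTR_ACTIVE when a handler on a quiescing vector completes.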
43  */
44 void
45 mlxcx_intr_disable(mlxcx_t *mlxp)
46 {
47 	int i;
48 
49 	mlxcx_cmd_eq_disable(mlxp);
50 
51 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
52 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
53 
54 		mutex_enter(&mleq->mleq_mtx);
55 
56 		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
57 			mutex_exit(&mleq->mleq_mtx);
58 			continue;
59 		}
60 
61 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
62 
63 		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
64 		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
65 			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);
66 
67 		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;
68 
69 		mutex_exit(&mleq->mleq_mtx);
70 	}
71 }
72 
73 void
74 mlxcx_intr_teardown(mlxcx_t *mlxp)
75 {
76 	int i;
77 	int ret;
78 
79 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
80 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
81 
82 		mutex_enter(&mleq->mleq_mtx);
83 		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
84 		if (mleq->mleq_state & MLXCX_EQ_CREATED)
85 			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
86 		if (i >= mlxp->mlx_intr_cq0) {
87 			VERIFY(avl_is_empty(&mleq->mleq_cqs));
88 			avl_destroy(&mleq->mleq_cqs);
89 		}
90 		mutex_exit(&mleq->mleq_mtx);
91 		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
92 		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
93 		if (ret != DDI_SUCCESS) {
94 			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
95 			    i, ret);
96 		}
97 		mutex_destroy(&mleq->mleq_mtx);
98 		cv_destroy(&mleq->mleq_cv);
99 	}
100 	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
101 	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
102 	mlxp->mlx_intr_handles = NULL;
103 	mlxp->mlx_eqs = NULL;
104 }
105 
106 /*
107  * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
108  */
109 static mlxcx_eventq_ent_t *
110 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
111 {
112 	mlxcx_eventq_ent_t *ent;
113 	ddi_fm_error_t err;
114 	uint_t ci;
115 	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
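
	/*
	 * Software "ownership" of an EQE is encoded in the entry's owner
	 * bit: it must match bit mleq_entshift of the consumer counter
	 * (swowner above), so the expected value flips each time mleq_cc
	 * completes a full pass of the mleq_nents-entry ring. An entry whose
	 * owner bit does not match has not yet been written by the hardware
	 * for this pass.
	 */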
116 
117 	/*
118 	 * This should only be called from interrupt context to ensure
119 	 * correctness of mleq_cc.
120 	 */
121 	ASSERT(servicing_interrupt());
122 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
123 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
124 
125 	/* mleq_nents is always a power of 2 */
126 	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
127 
128 	ent = &mleq->mleq_ent[ci];
129 	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
130 	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
131 	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
132 	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
133 	    DDI_FME_VERSION);
134 	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
135 		/* The PRM says we have to membar here, so we're doing it */
136 		membar_consumer();
137 		++mleq->mleq_cc;
138 		return (ent);
139 	}
140 	/*
141 	 * In the case of a DMA error, we should re-arm this EQ and then come
142 	 * back and try again when the device wakes us back up.
143 	 *
144 	 * Hopefully the fault will be gone by then.
145 	 */
146 	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
147 
148 	return (NULL);
149 }
150 
151 void
152 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
153 {
154 	uint_t try = 0;
155 	ddi_fm_error_t err;
156 	bits32_t v = new_bits32();
157 
158 	/*
159 	 * This is only called during initialization when the EQ is
160 	 * armed for the first time, and when re-armed at the end of
161 	 * interrupt processing.
162 	 */
163 	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
164 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
165 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
166 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
167 	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
168 
169 	mleq->mleq_state |= MLXCX_EQ_ARMED;
170 	mleq->mleq_cc_armed = mleq->mleq_cc;
171 
172 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
173 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
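
	/*
	 * The arm doorbell is a single 32-bit word carrying the EQ number
	 * and our current consumer counter; writing it at the
	 * MLXCX_UAR_EQ_ARM offset below both updates the device's view of
	 * the CI and re-arms the EQ so the next event raises an interrupt.
	 */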
174 
175 retry:
176 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
177 	    from_bits32(v));
178 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
179 	    DDI_FME_VERSION);
180 	if (err.fme_status == DDI_FM_OK)
181 		return;
182 	if (try++ < mlxcx_doorbell_tries) {
183 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
184 		goto retry;
185 	}
186 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
187 }
188 
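/*
 * Tell the device how far we have consumed the EQ without re-arming it.
 * This writes the same EQN/CI word as mlxcx_arm_eq(), but at the
 * MLXCX_UAR_EQ_NOARM offset, so the consumer counter is updated while the
 * EQ stays un-armed; mlxcx_arm_eq() is called again once interrupt
 * processing is complete.
 */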
189 static void
190 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
191 {
192 	bits32_t v = new_bits32();
193 	ddi_fm_error_t err;
194 
195 	/*
196 	 * This should only be called from interrupt context to ensure
197 	 * correctness of mleq_cc.
198 	 */
199 	ASSERT(servicing_interrupt());
200 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
201 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
202 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
203 
204 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
205 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
206 
207 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
208 	    from_bits32(v));
209 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
210 	    DDI_FME_VERSION);
211 	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
212 	/*
213 	 * Ignore the error; if it's still happening when we try to re-arm the
214 	 * EQ, we will note the impact then.
215 	 */
216 }
217 
218 static mlxcx_completionq_ent_t *
219 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
220 {
221 	mlxcx_completionq_ent_t *ent;
222 	ddi_fm_error_t err;
223 	uint_t ci;
224 	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
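
	/*
	 * CQEs use the same software-ownership scheme as EQEs do in
	 * mlxcx_eq_next(): the expected owner bit flips each time the
	 * consumer counter completes a pass of the mlcq_nents-entry ring.
	 */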
225 
226 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
227 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
228 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
229 
230 	/* mlcq_nents is always a power of 2 */
231 	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
232 
233 	ent = &mlcq->mlcq_ent[ci];
234 	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
235 	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
236 	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
237 	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
238 	    DDI_FME_VERSION);
239 	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
240 		/* The PRM says we have to membar here, so we're doing it */
241 		membar_consumer();
242 		++mlcq->mlcq_cc;
243 		return (ent);
244 	}
245 	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
246 
247 	return (NULL);
248 }
249 
250 void
251 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
252 {
253 	ddi_fm_error_t err;
254 	uint_t try = 0;
255 
256 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
257 
258 retry:
259 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
260 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
261 	    DDI_FME_VERSION);
262 	if (err.fme_status != DDI_FM_OK) {
263 		if (try++ < mlxcx_doorbell_tries) {
264 			ddi_fm_dma_err_clear(
265 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
266 			    DDI_FME_VERSION);
267 			goto retry;
268 		} else {
269 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
270 			return;
271 		}
272 	}
273 }
274 
275 void
276 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
277 {
278 	bits32_t dbval = new_bits32();
279 	uint64_t udbval;
280 	ddi_fm_error_t err;
281 	uint_t try = 0;
282 
283 	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
284 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
285 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
286 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
287 
288 	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
289 		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
290 	}
291 
292 	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
293 		return;
294 
295 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
296 	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
297 	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
298 
299 	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
300 	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
301 
302 	udbval = (uint64_t)from_bits32(dbval) << 32;
303 	udbval |= mlcq->mlcq_num & 0xffffff;
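
	/*
	 * The 64-bit CQ arm doorbell written to the UAR below carries the
	 * arm sequence/CI word (the same dbval stashed in the doorbell
	 * record) in its upper 32 bits and the 24-bit CQ number in its low
	 * bits, i.e. udbval == ((uint64_t)from_bits32(dbval) << 32) |
	 * (mlcq_num & 0xffffff).
	 */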
304 
305 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
306 	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
307 
308 retry:
309 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
310 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
311 	    DDI_FME_VERSION);
312 	if (err.fme_status != DDI_FM_OK) {
313 		if (try++ < mlxcx_doorbell_tries) {
314 			ddi_fm_dma_err_clear(
315 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
316 			    DDI_FME_VERSION);
317 			goto retry;
318 		} else {
319 			goto err;
320 		}
321 	}
322 
323 	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
324 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
325 	    DDI_FME_VERSION);
326 	if (err.fme_status == DDI_FM_OK)
327 		return;
328 	if (try++ < mlxcx_doorbell_tries) {
329 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
330 		goto retry;
331 	}
332 
333 err:
334 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
335 }
336 
337 const char *
338 mlxcx_event_name(mlxcx_event_t evt)
339 {
340 	switch (evt) {
341 	case MLXCX_EVENT_COMPLETION:
342 		return ("COMPLETION");
343 	case MLXCX_EVENT_PATH_MIGRATED:
344 		return ("PATH_MIGRATED");
345 	case MLXCX_EVENT_COMM_ESTABLISH:
346 		return ("COMM_ESTABLISH");
347 	case MLXCX_EVENT_SENDQ_DRAIN:
348 		return ("SENDQ_DRAIN");
349 	case MLXCX_EVENT_LAST_WQE:
350 		return ("LAST_WQE");
351 	case MLXCX_EVENT_SRQ_LIMIT:
352 		return ("SRQ_LIMIT");
353 	case MLXCX_EVENT_DCT_ALL_CLOSED:
354 		return ("DCT_ALL_CLOSED");
355 	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
356 		return ("DCT_ACCKEY_VIOL");
357 	case MLXCX_EVENT_CQ_ERROR:
358 		return ("CQ_ERROR");
359 	case MLXCX_EVENT_WQ_CATASTROPHE:
360 		return ("WQ_CATASTROPHE");
361 	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
362 		return ("PATH_MIGRATE_FAIL");
363 	case MLXCX_EVENT_PAGE_FAULT:
364 		return ("PAGE_FAULT");
365 	case MLXCX_EVENT_WQ_INVALID_REQ:
366 		return ("WQ_INVALID_REQ");
367 	case MLXCX_EVENT_WQ_ACCESS_VIOL:
368 		return ("WQ_ACCESS_VIOL");
369 	case MLXCX_EVENT_SRQ_CATASTROPHE:
370 		return ("SRQ_CATASTROPHE");
371 	case MLXCX_EVENT_INTERNAL_ERROR:
372 		return ("INTERNAL_ERROR");
373 	case MLXCX_EVENT_PORT_STATE:
374 		return ("PORT_STATE");
375 	case MLXCX_EVENT_GPIO:
376 		return ("GPIO");
377 	case MLXCX_EVENT_PORT_MODULE:
378 		return ("PORT_MODULE");
379 	case MLXCX_EVENT_TEMP_WARNING:
380 		return ("TEMP_WARNING");
381 	case MLXCX_EVENT_REMOTE_CONFIG:
382 		return ("REMOTE_CONFIG");
383 	case MLXCX_EVENT_DCBX_CHANGE:
384 		return ("DCBX_CHANGE");
385 	case MLXCX_EVENT_DOORBELL_CONGEST:
386 		return ("DOORBELL_CONGEST");
387 	case MLXCX_EVENT_STALL_VL:
388 		return ("STALL_VL");
389 	case MLXCX_EVENT_CMD_COMPLETION:
390 		return ("CMD_COMPLETION");
391 	case MLXCX_EVENT_PAGE_REQUEST:
392 		return ("PAGE_REQUEST");
393 	case MLXCX_EVENT_NIC_VPORT:
394 		return ("NIC_VPORT");
395 	case MLXCX_EVENT_EC_PARAMS_CHANGE:
396 		return ("EC_PARAMS_CHANGE");
397 	case MLXCX_EVENT_XRQ_ERROR:
398 		return ("XRQ_ERROR");
399 	}
400 	return ("UNKNOWN");
401 }
402 
403 /* Should be called only when link state has changed. */
404 void
405 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
406 {
407 	link_state_t ls;
408 
409 	mutex_enter(&port->mlp_mtx);
410 	(void) mlxcx_cmd_query_port_status(mlxp, port);
411 	(void) mlxcx_cmd_query_port_speed(mlxp, port);
412 	(void) mlxcx_cmd_query_port_fec(mlxp, port);
413 
414 	switch (port->mlp_oper_status) {
415 	case MLXCX_PORT_STATUS_UP:
416 	case MLXCX_PORT_STATUS_UP_ONCE:
417 		ls = LINK_STATE_UP;
418 		break;
419 	case MLXCX_PORT_STATUS_DOWN:
420 		ls = LINK_STATE_DOWN;
421 		break;
422 	default:
423 		ls = LINK_STATE_UNKNOWN;
424 	}
425 	mac_link_update(mlxp->mlx_mac_hdl, ls);
426 
427 	mutex_exit(&port->mlp_mtx);
428 }
429 
430 CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);
431 
432 static void
433 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
434 {
435 	ddi_device_acc_attr_t acc;
436 	ddi_dma_attr_t attr;
437 	mlxcx_dev_page_t *mdp;
438 	mlxcx_dev_page_t **pages;
439 	size_t i;
440 	const ddi_dma_cookie_t *ck;
441 
442 	/*
443 	 * If this isn't enough, the HCA will ask for more
444 	 */
445 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
446 
447 	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);
448 
449 	for (i = 0; i < npages; i++) {
450 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
451 		mlxcx_dma_acc_attr(mlxp, &acc);
452 		mlxcx_dma_page_attr(mlxp, &attr);
453 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
454 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
455 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
456 			    npages);
457 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
458 			goto cleanup_npages;
459 		}
460 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
461 		mdp->mxdp_pa = ck->dmac_laddress;
462 		pages[i] = mdp;
463 	}
464 
465 	mutex_enter(&mlxp->mlx_pagemtx);
466 
467 	if (!mlxcx_cmd_give_pages(mlxp,
468 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
469 		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
470 		    "pages!", npages);
471 		mutex_exit(&mlxp->mlx_pagemtx);
472 		goto cleanup_npages;
473 	}
474 
475 	for (i = 0; i < npages; i++) {
476 		avl_add(&mlxp->mlx_pages, pages[i]);
477 	}
478 	mlxp->mlx_npages += npages;
479 	mutex_exit(&mlxp->mlx_pagemtx);
480 
481 	kmem_free(pages, sizeof (*pages) * npages);
482 
483 	return;
484 
485 cleanup_npages:
486 	for (i = 0; i < npages; i++) {
487 		if ((mdp = pages[i]) == NULL)
488 			break;
489 
490 		mlxcx_dma_free(&mdp->mxdp_dma);
491 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
492 	}
493 	/* Tell the hardware we had an allocation failure. */
494 	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
495 	    0, NULL);
496 	mutex_exit(&mlxp->mlx_pagemtx);
497 
498 	kmem_free(pages, sizeof (*pages) * npages);
499 }
500 
501 static void
502 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
503 {
504 	uint_t i;
505 	int32_t ret;
506 	uint64_t *pas;
507 	mlxcx_dev_page_t *mdp, probe;
508 
509 	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);
510 
511 	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
512 		kmem_free(pas, sizeof (*pas) * npages);
513 		return;
514 	}
515 
516 	mutex_enter(&mlxp->mlx_pagemtx);
517 
518 	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
519 
520 	for (i = 0; i < ret; i++) {
521 		bzero(&probe, sizeof (probe));
522 		probe.mxdp_pa = pas[i];
523 
524 		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
525 
526 		if (mdp != NULL) {
527 			avl_remove(&mlxp->mlx_pages, mdp);
528 			mlxp->mlx_npages--;
529 			mlxcx_dma_free(&mdp->mxdp_dma);
530 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
531 		} else {
532 			mlxcx_warn(mlxp, "hardware returned a page "
533 			    "with PA 0x%" PRIx64 " but we have no "
534 			    "record of giving out such a page", pas[i]);
535 		}
536 	}
537 
538 	mutex_exit(&mlxp->mlx_pagemtx);
539 
540 	kmem_free(pas, sizeof (*pas) * npages);
541 }
542 
543 static void
544 mlxcx_pages_task(void *arg)
545 {
546 	mlxcx_async_param_t *param = arg;
547 	mlxcx_t *mlxp = param->mla_mlx;
548 	int32_t npages;
549 
550 	/*
551 	 * We can drop the pending status now, as we've extracted what
552 	 * is needed to process the pages request.
553 	 *
554 	 * We should never get another pages request until we have
555 	 * responded to this one; that, together with the guard in
556 	 * mlxcx_sync_intr, makes it safe to reuse the mlxcx_async_param_t.
557 	 */
558 	mutex_enter(&param->mla_mtx);
559 	npages = param->mla_pages.mlp_npages;
560 	param->mla_pending = B_FALSE;
561 	bzero(&param->mla_pages, sizeof (param->mla_pages));
562 	mutex_exit(&param->mla_mtx);
563 
564 	/*
565 	 * The PRM describes npages as: "Number of missing / unneeded pages
566 	 * (signed number, msb indicate sign)". The implication is that
567 	 * it will not be zero. We are expected to use this to give or
568 	 * take back pages (based on the sign) using the MANAGE_PAGES
569 	 * command but we can't determine whether to give or take
570 	 * when npages is zero. So we do nothing.
571 	 */
572 	if (npages > 0) {
573 		mlxcx_give_pages_once(mlxp, npages);
574 	} else if (npages < 0) {
575 		mlxcx_take_pages_once(mlxp, -1 * npages);
576 	}
577 }
578 
579 static void
580 mlxcx_link_state_task(void *arg)
581 {
582 	mlxcx_async_param_t *param = arg;
583 	mlxcx_port_t *port;
584 	mlxcx_t *mlxp;
585 
586 	/*
587 	 * Gather the arguments from the parameters and clear the
588 	 * pending status.
589 	 *
590 	 * The pending status must be cleared *before* we update the
591 	 * link state. This is both safe and required to ensure we always
592 	 * have the correct link state. It is safe because taskq_ents are
593 	 * reusable (by the caller of taskq_dispatch_ent()) once the
594 	 * task function has started executing. It must be cleared before
595 	 * updating the link state to guarantee that further link state change
596 	 * events are not missed and we always have the current link state.
597 	 */
598 	mutex_enter(&param->mla_mtx);
599 	mlxp = param->mla_mlx;
600 	port = param->mla_port;
601 	param->mla_pending = B_FALSE;
602 	mutex_exit(&param->mla_mtx);
603 
604 	mlxcx_update_link_state(mlxp, port);
605 }
606 
607 static const char *
608 mlxcx_module_error_string(mlxcx_module_error_type_t err)
609 {
610 	switch (err) {
611 	case MLXCX_MODULE_ERR_POWER_BUDGET:
612 		return ("POWER_BUDGET");
613 	case MLXCX_MODULE_ERR_LONG_RANGE:
614 		return ("LONG_RANGE");
615 	case MLXCX_MODULE_ERR_BUS_STUCK:
616 		return ("BUS_STUCK");
617 	case MLXCX_MODULE_ERR_NO_EEPROM:
618 		return ("NO_EEPROM");
619 	case MLXCX_MODULE_ERR_ENFORCEMENT:
620 		return ("ENFORCEMENT");
621 	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
622 		return ("UNKNOWN_IDENT");
623 	case MLXCX_MODULE_ERR_HIGH_TEMP:
624 		return ("HIGH_TEMP");
625 	case MLXCX_MODULE_ERR_CABLE_SHORTED:
626 		return ("CABLE_SHORTED");
627 	default:
628 		return ("UNKNOWN");
629 	}
630 }
631 
632 static void
633 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
634 {
635 	uint64_t ena;
636 	char buf[FM_MAX_CLASS];
637 	const char *lename;
638 	const char *ename;
639 	const char *stname;
640 	uint_t eno = 0;
641 	mlxcx_module_status_t state = evd->mled_port_mod_module_status;
642 
643 	switch (state) {
644 	case MLXCX_MODULE_ERROR:
645 		stname = "error";
646 		eno = evd->mled_port_mod_error_type;
647 		lename = mlxcx_module_error_string(eno);
648 		switch (eno) {
649 		case MLXCX_MODULE_ERR_ENFORCEMENT:
650 			ename = DDI_FM_TXR_ERROR_WHITELIST;
651 			break;
652 		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
653 		case MLXCX_MODULE_ERR_NO_EEPROM:
654 			ename = DDI_FM_TXR_ERROR_NOTSUPP;
655 			break;
656 		case MLXCX_MODULE_ERR_HIGH_TEMP:
657 			ename = DDI_FM_TXR_ERROR_OVERTEMP;
658 			break;
659 		case MLXCX_MODULE_ERR_POWER_BUDGET:
660 		case MLXCX_MODULE_ERR_LONG_RANGE:
661 		case MLXCX_MODULE_ERR_CABLE_SHORTED:
662 			ename = DDI_FM_TXR_ERROR_HWFAIL;
663 			break;
664 		case MLXCX_MODULE_ERR_BUS_STUCK:
665 		default:
666 			ename = DDI_FM_TXR_ERROR_UNKNOWN;
667 		}
668 		break;
669 	default:
670 		return;
671 	}
672 
673 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
674 	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
675 	ena = fm_ena_generate(0, FM_ENA_FMT1);
676 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
677 		return;
678 
679 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
680 	    /* compulsory FM props */
681 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
682 	    /* generic NIC txr error event props */
683 	    "error", DATA_TYPE_STRING, ename,
684 	    "port_index", DATA_TYPE_UINT8, 0,
685 	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
686 	    /* local props */
687 	    "mlxcx_state", DATA_TYPE_STRING, stname,
688 	    "mlxcx_error", DATA_TYPE_STRING, lename,
689 	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
690 	    NULL);
691 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
692 }
693 
694 /*
695  * Common beginning of interrupt processing.
696  * Confirm interrupt hasn't been disabled, verify its state and
697  * mark the vector as active.
698  */
699 static boolean_t
700 mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
701 {
702 	mutex_enter(&mleq->mleq_mtx);
703 
704 	if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
705 		mutex_exit(&mleq->mleq_mtx);
706 		return (B_FALSE);
707 	}
708 
709 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
710 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
711 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
712 		mlxcx_warn(mlxp, "intr %d in bad eq state",
713 		    mleq->mleq_intr_index);
714 		mutex_exit(&mleq->mleq_mtx);
715 		return (B_FALSE);
716 	}
717 
718 	mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
719 	mutex_exit(&mleq->mleq_mtx);
720 
721 	return (B_TRUE);
722 }
723 
724 /*
725  * End of interrupt processing.
726  * Mark vector as no longer active and if shutdown is blocked on this vector,
727  * wake it up.
728  */
729 static void
730 mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
731 {
732 	mutex_enter(&mleq->mleq_mtx);
733 	if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
734 		cv_signal(&mleq->mleq_cv);
735 
736 	mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
737 	mutex_exit(&mleq->mleq_mtx);
738 }
739 
740 static uint_t
741 mlxcx_intr_async(caddr_t arg, caddr_t arg2)
742 {
743 	mlxcx_t *mlxp = (mlxcx_t *)arg;
744 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
745 	mlxcx_eventq_ent_t *ent;
746 	mlxcx_async_param_t *param;
747 	uint_t portn;
748 	uint16_t func;
749 
750 	if (!mlxcx_intr_ini(mlxp, mleq))
751 		return (DDI_INTR_CLAIMED);
752 
753 	ent = mlxcx_eq_next(mleq);
754 	if (ent == NULL) {
755 		goto done;
756 	}
757 
758 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
759 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
760 
761 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
762 		DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
763 		    ent);
764 
765 		switch (ent->mleqe_event_type) {
766 		case MLXCX_EVENT_CMD_COMPLETION:
767 			mlxcx_cmd_completion(mlxp, ent);
768 			break;
769 		case MLXCX_EVENT_PAGE_REQUEST:
770 			func = from_be16(ent->mleqe_page_request.
771 			    mled_page_request_function_id);
772 			VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);
773 
774 			param = &mlxp->mlx_npages_req[func];
775 			mutex_enter(&param->mla_mtx);
776 			if (param->mla_pending) {
777 				/*
778 				 * The PRM states we will not get another
779 				 * page request event until any pending have
780 				 * been posted as complete to the HCA.
781 				 * This will guard against this anyway.
782 				 */
783 				mutex_exit(&param->mla_mtx);
784 				mlxcx_warn(mlxp, "Unexpected page request "
785 				    "whilst another is pending");
786 				break;
787 			}
788 			param->mla_pages.mlp_npages =
789 			    (int32_t)from_be32(ent->mleqe_page_request.
790 			    mled_page_request_num_pages);
791 			param->mla_pages.mlp_func = func;
792 			param->mla_pending = B_TRUE;
793 			ASSERT3P(param->mla_mlx, ==, mlxp);
794 			mutex_exit(&param->mla_mtx);
795 
796 			taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
797 			    param, 0, &param->mla_tqe);
798 			break;
799 		case MLXCX_EVENT_PORT_STATE:
800 			portn = get_bits8(
801 			    ent->mleqe_port_state.mled_port_state_port_num,
802 			    MLXCX_EVENT_PORT_NUM) - 1;
803 			if (portn >= mlxp->mlx_nports)
804 				break;
805 
806 			param = &mlxp->mlx_ports[portn].mlx_port_event;
807 			mutex_enter(&param->mla_mtx);
808 			if (param->mla_pending) {
809 				/*
810 				 * There is a link state event pending
811 				 * processing. When that event is handled
812 				 * it will get the current link state.
813 				 */
814 				mutex_exit(&param->mla_mtx);
815 				break;
816 			}
817 
818 			ASSERT3P(param->mla_mlx, ==, mlxp);
819 			ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);
820 
821 			param->mla_pending = B_TRUE;
822 			mutex_exit(&param->mla_mtx);
823 
824 			taskq_dispatch_ent(mlxp->mlx_async_tq,
825 			    mlxcx_link_state_task, param, 0, &param->mla_tqe);
826 			break;
827 		case MLXCX_EVENT_PORT_MODULE:
828 			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
829 			break;
830 		default:
831 			mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
832 			    ent->mleqe_event_type, mleq->mleq_intr_index);
833 		}
834 	}
835 
836 	mlxcx_arm_eq(mlxp, mleq);
837 
838 done:
839 	mlxcx_intr_fini(mleq);
840 	return (DDI_INTR_CLAIMED);
841 }
842 
843 static boolean_t
844 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
845     size_t bytelim)
846 {
847 	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
848 	mlxcx_completionq_ent_t *cent;
849 	mblk_t *mp, *cmp, *nmp;
850 	mlxcx_buffer_t *buf;
851 	boolean_t found, added;
852 	size_t bytes = 0;
853 	uint_t rx_frames = 0;
854 	uint_t comp_cnt = 0;
855 	int64_t wqebbs, bufcnt;
856 
857 	*mpp = NULL;
858 
859 	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
860 	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
861 	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
862 	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
863 		return (B_FALSE);
864 	}
865 
866 	nmp = cmp = mp = NULL;
867 
868 	wqebbs = 0;
869 	bufcnt = 0;
870 	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
871 	    cent = mlxcx_cq_next(mlcq)) {
872 		/*
873 		 * Teardown and ring stop can atomic_or this flag
874 		 * into our state if they want us to stop early.
875 		 */
876 		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
877 			return (B_FALSE);
878 
879 		comp_cnt++;
880 		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
881 		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
882 			/* NOP */
883 			atomic_dec_64(&wq->mlwq_wqebb_used);
884 			goto nextcq;
885 		}
886 
887 lookagain:
888 		/*
889 		 * Generally the buffer we're looking for will be
890 		 * at the front of the list, so this loop won't
891 		 * need to look far.
892 		 */
893 		buf = list_head(&mlcq->mlcq_buffers);
894 		found = B_FALSE;
895 		while (buf != NULL) {
896 			if ((buf->mlb_wqe_index & UINT16_MAX) ==
897 			    from_be16(cent->mlcqe_wqe_counter)) {
898 				found = B_TRUE;
899 				break;
900 			}
901 			buf = list_next(&mlcq->mlcq_buffers, buf);
902 		}
903 
904 		if (!found) {
905 			/*
906 			 * If there's any buffers waiting on the
907 			 * buffers_b list, then merge those into
908 			 * the main list and have another look.
909 			 *
910 			 * The wq enqueue routines push new buffers
911 			 * into buffers_b so that they can avoid
912 			 * taking the mlcq_mtx and blocking us for
913 			 * every single packet.
914 			 */
915 			added = B_FALSE;
916 			mutex_enter(&mlcq->mlcq_bufbmtx);
917 			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
918 				list_move_tail(&mlcq->mlcq_buffers,
919 				    &mlcq->mlcq_buffers_b);
920 				added = B_TRUE;
921 			}
922 			mutex_exit(&mlcq->mlcq_bufbmtx);
923 			if (added)
924 				goto lookagain;
925 
926 			/*
927 			 * This check could go just after the lookagain
928 			 * label, but it is a hot code path so we don't
929 			 * want to unnecessarily grab a lock and check
930 			 * a flag for a relatively rare event (the ring
931 			 * being stopped).
932 			 */
933 			mutex_enter(&wq->mlwq_mtx);
934 			if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
935 				mutex_exit(&wq->mlwq_mtx);
936 				goto nextcq;
937 			}
938 			mutex_exit(&wq->mlwq_mtx);
939 
940 			buf = list_head(&mlcq->mlcq_buffers);
941 			mlxcx_warn(mlxp, "got completion on CQ %x but "
942 			    "no buffer matching wqe found: %x (first "
943 			    "buffer counter = %x)", mlcq->mlcq_num,
944 			    from_be16(cent->mlcqe_wqe_counter),
945 			    buf == NULL ? UINT32_MAX :
946 			    buf->mlb_wqe_index);
947 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
948 			goto nextcq;
949 		}
950 
951 		/*
952 		 * The buf is likely to be freed below, count this now.
953 		 */
954 		wqebbs += buf->mlb_wqebbs;
955 
956 		list_remove(&mlcq->mlcq_buffers, buf);
957 		bufcnt++;
958 
959 		switch (mlcq->mlcq_wq->mlwq_type) {
960 		case MLXCX_WQ_TYPE_SENDQ:
961 			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
962 			break;
963 		case MLXCX_WQ_TYPE_RECVQ:
964 			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
965 			bytes += from_be32(cent->mlcqe_byte_cnt);
966 			if (nmp != NULL) {
967 				if (cmp != NULL) {
968 					cmp->b_next = nmp;
969 					cmp = nmp;
970 				} else {
971 					mp = cmp = nmp;
972 				}
973 
974 				rx_frames++;
975 			}
976 			break;
977 		}
978 
979 		/*
980 		 * Update the consumer index with what has been processed,
981 		 * followed by driver counters. It is important to tell the
982 		 * hardware first; otherwise, when we throw more packets at
983 		 * it, it may get an overflow error.
984 		 * We do this whenever we've processed enough to bridge the
985 		 * high->low water mark.
986 		 */
987 		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
988 			mlxcx_update_cqci(mlxp, mlcq);
989 			/*
990 			 * Both these variables are updated using
991 			 * atomics as they are modified in other code paths
992 			 * (e.g. during tx) which hold different locks.
993 			 */
994 			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
995 			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
996 			wqebbs = 0;
997 			bufcnt = 0;
998 			comp_cnt = 0;
999 		}
1000 nextcq:
1001 		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
1002 		    (bytelim != 0 && bytes > bytelim))
1003 			break;
1004 	}
1005 
1006 	if (comp_cnt > 0) {
1007 		mlxcx_update_cqci(mlxp, mlcq);
1008 		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
1009 		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
1010 	}
1011 
1012 	*mpp = mp;
1013 	return (B_TRUE);
1014 }
1015 
1016 
1017 mblk_t *
1018 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
1019 {
1020 	mblk_t *mp = NULL;
1021 
1022 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1023 
1024 	ASSERT(mlcq->mlcq_wq != NULL);
1025 	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
1026 
1027 	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
1028 
1029 	return (mp);
1030 }
1031 
1032 static uint_t
1033 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
1034 {
1035 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1036 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
1037 	mlxcx_eventq_ent_t *ent;
1038 	mlxcx_completion_queue_t *mlcq, probe;
1039 	mlxcx_work_queue_t *mlwq;
1040 	mblk_t *mp = NULL;
1041 	boolean_t tellmac = B_FALSE;
1042 
1043 	if (!mlxcx_intr_ini(mlxp, mleq))
1044 		return (DDI_INTR_CLAIMED);
1045 
1046 	ent = mlxcx_eq_next(mleq);
1047 	if (ent == NULL) {
1048 		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
1049 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
1050 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1051 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1052 			    mleq->mleq_intr_index]);
1053 		}
1054 		goto done;
1055 	}
1056 	mleq->mleq_badintrs = 0;
1057 
1058 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
1059 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
1060 
1061 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
1062 		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
1063 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
1064 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1065 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1066 			    mleq->mleq_intr_index]);
1067 			goto done;
1068 		}
1069 		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
1070 
1071 		probe.mlcq_num =
1072 		    from_be24(ent->mleqe_completion.mled_completion_cqn);
1073 		mutex_enter(&mleq->mleq_mtx);
1074 		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
1075 		mutex_exit(&mleq->mleq_mtx);
1076 
1077 		if (mlcq == NULL)
1078 			continue;
1079 
1080 		mlwq = mlcq->mlcq_wq;
1081 
1082 		/*
1083 		 * mlcq_arm_mtx is used to avoid race conditions between
1084 		 * this interrupt routine and the transition from polling
1085 		 * back to interrupt mode. When exiting poll mode the
1086 		 * CQ is likely to be un-armed, which means there will
1087 		 * be no events for the CQ coming through here, and
1088 		 * consequently very low contention on mlcq_arm_mtx.
1089 		 *
1090 		 * mlcq_arm_mtx must be released before calls into mac
1091 		 * layer in order to avoid deadlocks.
1092 		 */
1093 		mutex_enter(&mlcq->mlcq_arm_mtx);
1094 		mlcq->mlcq_ec++;
1095 		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
1096 
1097 		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
1098 			/*
1099 			 * If we failed to take the mutex because the
1100 			 * polling function has it, just move on.
1101 			 * We don't want to block other CQs behind
1102 			 * this one.
1103 			 */
1104 			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
1105 				mutex_exit(&mlcq->mlcq_arm_mtx);
1106 				goto update_eq;
1107 			}
1108 
1109 			/* Otherwise we will wait. */
1110 			mutex_enter(&mlcq->mlcq_mtx);
1111 		}
1112 
1113 		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
1114 		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
1115 			/*
1116 			 * The ring is not in polling mode and we processed
1117 			 * some completion queue entries.
1118 			 */
1119 			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
1120 			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
1121 				atomic_and_uint(&mlcq->mlcq_state,
1122 				    ~MLXCX_CQ_BLOCKED_MAC);
1123 				tellmac = B_TRUE;
1124 			}
1125 
1126 			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
1127 			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
1128 				atomic_and_uint(&mlwq->mlwq_state,
1129 				    ~MLXCX_WQ_BLOCKED_MAC);
1130 				tellmac = B_TRUE;
1131 			}
1132 
1133 			mlxcx_arm_cq(mlxp, mlcq);
1134 
1135 			mutex_exit(&mlcq->mlcq_mtx);
1136 			mutex_exit(&mlcq->mlcq_arm_mtx);
1137 
1138 			if (tellmac) {
1139 				mac_tx_ring_update(mlxp->mlx_mac_hdl,
1140 				    mlcq->mlcq_mac_hdl);
1141 				tellmac = B_FALSE;
1142 			}
1143 
1144 			if (mp != NULL) {
1145 				mac_rx_ring(mlxp->mlx_mac_hdl,
1146 				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
1147 			}
1148 		} else {
1149 			mutex_exit(&mlcq->mlcq_mtx);
1150 			mutex_exit(&mlcq->mlcq_arm_mtx);
1151 		}
1152 
1153 update_eq:
1154 		/*
1155 		 * Updating the consumer counter for an EQ requires a write
1156 		 * to the UAR, which is possibly expensive.
1157 		 *
1158 		 * Try to do it only often enough to stop us wrapping around.
1159 		 */
1160 		if ((mleq->mleq_cc & 0x7) == 0)
1161 			mlxcx_update_eq(mlxp, mleq);
1162 	}
1163 
1164 	mlxcx_arm_eq(mlxp, mleq);
1165 
1166 done:
1167 	mlxcx_intr_fini(mleq);
1168 	return (DDI_INTR_CLAIMED);
1169 }
1170 
1171 boolean_t
1172 mlxcx_intr_setup(mlxcx_t *mlxp)
1173 {
1174 	dev_info_t *dip = mlxp->mlx_dip;
1175 	int ret;
1176 	int nintrs = 0;
1177 	int navail = 0;
1178 	int types, i;
1179 	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
1180 
1181 	ret = ddi_intr_get_supported_types(dip, &types);
1182 	if (ret != DDI_SUCCESS) {
1183 		mlxcx_warn(mlxp, "Failed to get supported interrupt types");
1184 		return (B_FALSE);
1185 	}
1186 
1187 	if (!(types & DDI_INTR_TYPE_MSIX)) {
1188 		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
1189 		    "requires MSI-X");
1190 		return (B_FALSE);
1191 	}
1192 
1193 	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
1194 	if (ret != DDI_SUCCESS) {
1195 		mlxcx_warn(mlxp, "Failed to get number of interrupts");
1196 		return (B_FALSE);
1197 	}
1198 	if (nintrs < 2) {
1199 		mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx "
1200 		    "requires 2", nintrs);
1201 		return (B_FALSE);
1202 	}
1203 
1204 	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
1205 	if (ret != DDI_SUCCESS) {
1206 		mlxcx_warn(mlxp,
1207 		    "Failed to get number of available interrupts");
1208 		return (B_FALSE);
1209 	}
1210 	if (navail < 2) {
1211 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
1212 		    "requires 2", navail);
1213 		return (B_FALSE);
1214 	}
1215 
1216 	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
1217 	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
1218 	/*
1219 	 * Interrupts for Completion Queue events start from vector 1
1220 	 * up to available vectors. Vector 0 is used for asynchronous
1221 	 * events.
1222 	 */
1223 	mlxp->mlx_intr_cq0 = 1;
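
	/*
	 * The resulting layout is: vector 0 services mlx_eqs[0], the
	 * asynchronous EQ handled by mlxcx_intr_async(); vectors
	 * mlx_intr_cq0 through mlx_intr_count - 1 each service a completion
	 * EQ handled by mlxcx_intr_n().
	 */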
1224 
1225 	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
1226 	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
1227 	if (ret != DDI_SUCCESS) {
1228 		mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
1229 		mlxcx_intr_teardown(mlxp);
1230 		return (B_FALSE);
1231 	}
1232 	if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
1233 		mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
1234 		    "requires %d", mlxp->mlx_intr_count,
1235 		    mlxp->mlx_intr_cq0 + 1);
1236 		mlxcx_intr_teardown(mlxp);
1237 		return (B_FALSE);
1238 	}
1239 	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
1240 
1241 	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
1242 	if (ret != DDI_SUCCESS) {
1243 		mlxcx_warn(mlxp, "Failed to get interrupt priority");
1244 		mlxcx_intr_teardown(mlxp);
1245 		return (B_FALSE);
1246 	}
1247 
1248 	/*
1249 	 * Set the interrupt priority for the asynchronous handler higher
1250 	 * than the ring handlers. Some operations which issue commands,
1251 	 * and thus rely on the async interrupt handler for posting
1252 	 * completion, do so with a CQ mutex held. The CQ mutex is also
1253 	 * acquired during ring processing, so if the ring processing vector
1254 	 * happens to be assigned to the same CPU as the async vector
1255 	 * it can hold off the async interrupt thread and lead to a deadlock.
1256 	 * By assigning a higher priority to the async vector, it will
1257 	 * always be dispatched.
1258 	 */
1259 	mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
1260 	if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
1261 		mlxp->mlx_async_intr_pri++;
1262 	} else {
1263 		mlxp->mlx_intr_pri--;
1264 	}
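
	/*
	 * For example, if the initial priority is below LOCK_LEVEL the async
	 * vector simply runs one level above the ring vectors; if it is
	 * already at LOCK_LEVEL, the ring vectors are dropped one level
	 * instead, so the async vector still ends up strictly higher.
	 */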
1265 
1266 	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1267 	    sizeof (mlxcx_event_queue_t);
1268 	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1269 
1270 	/*
1271 	 * In the failure path, mlxcx_intr_teardown() expects this
1272 	 * mutex and avl tree to be init'ed - so do it now.
1273 	 */
1274 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1275 		uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
1276 		    mlxp->mlx_intr_pri;
1277 
1278 		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1279 		    DDI_INTR_PRI(pri));
1280 		cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);
1281 
1282 		if (i < mlxp->mlx_intr_cq0)
1283 			continue;
1284 
1285 		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1286 		    sizeof (mlxcx_completion_queue_t),
1287 		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1288 	}
1289 
1290 	while (mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN) {
1291 		ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
1292 		    mlxp->mlx_async_intr_pri);
1293 		if (ret == DDI_SUCCESS)
1294 			break;
1295 		mlxcx_note(mlxp,
1296 		    "!Failed to set interrupt priority to %u for "
1297 		    "async interrupt vector", mlxp->mlx_async_intr_pri);
1298 		/*
1299 		 * If it was not possible to set the IPL for the async
1300 		 * interrupt to the desired value, then try a lower priority.
1301 		 * Some PSMs can only accommodate a limited number of vectors
1302 		 * at each priority level (or group of priority levels). Since
1303 		 * the async priority must be set higher than the ring
1304 		 * handlers, lower both. The ring handler priority is set
1305 		 * below.
1306 		 */
1307 		mlxp->mlx_async_intr_pri--;
1308 		mlxp->mlx_intr_pri--;
1309 	}
1310 
1311 	if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) {
1312 		mlxcx_warn(mlxp, "Failed to find an interrupt priority for "
1313 		    "async interrupt vector");
1314 		mlxcx_intr_teardown(mlxp);
1315 		return (B_FALSE);
1316 	}
1317 
1318 	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
1319 	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1320 	if (ret != DDI_SUCCESS) {
1321 		mlxcx_warn(mlxp, "Failed to add async interrupt handler");
1322 		mlxcx_intr_teardown(mlxp);
1323 		return (B_FALSE);
1324 	}
1325 
1326 	/*
1327 	 * If we have enough interrupts, set their "type" fields so that we
1328 	 * avoid mixing RX and TX queues on the same EQs.
1329 	 */
1330 	if (mlxp->mlx_intr_count >= 8) {
1331 		eqt = MLXCX_EQ_TYPE_RX;
1332 	}
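
	/*
	 * With eight or more vectors, the completion EQs alternate
	 * MLXCX_EQ_TYPE_RX, MLXCX_EQ_TYPE_TX, RX, TX, ... starting at
	 * mlx_intr_cq0; with fewer, they all stay MLXCX_EQ_TYPE_ANY and RX
	 * and TX completion queues may share EQs.
	 */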
1333 
1334 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
1335 		mlxp->mlx_eqs[i].mleq_intr_index = i;
1336 
1337 		mlxp->mlx_eqs[i].mleq_type = eqt;
1338 		/*
1339 		 * If eqt is still ANY, just leave it set to that
1340 		 * (no else here).
1341 		 */
1342 		if (eqt == MLXCX_EQ_TYPE_RX) {
1343 			eqt = MLXCX_EQ_TYPE_TX;
1344 		} else if (eqt == MLXCX_EQ_TYPE_TX) {
1345 			eqt = MLXCX_EQ_TYPE_RX;
1346 		}
1347 
1348 		while (mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN) {
1349 			ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
1350 			    mlxp->mlx_intr_pri);
1351 			if (ret == DDI_SUCCESS)
1352 				break;
1353 			mlxcx_note(mlxp, "!Failed to set interrupt priority to "
1354 			    "%u for interrupt vector %d",
1355 			    mlxp->mlx_intr_pri, i);
1356 			mlxp->mlx_intr_pri--;
1357 		}
1358 		if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) {
1359 			mlxcx_warn(mlxp,
1360 			    "Failed to find an interrupt priority for "
1361 			    "interrupt vector %d", i);
1362 			mlxcx_intr_teardown(mlxp);
1363 			return (B_FALSE);
1364 		}
1365 
1366 		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1367 		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1368 		if (ret != DDI_SUCCESS) {
1369 			mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
1370 			    i);
1371 			mlxcx_intr_teardown(mlxp);
1372 			return (B_FALSE);
1373 		}
1374 	}
1375 
1376 	return (B_TRUE);
1377 }
1378