xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_intr.c (revision 3184921aa9155f2314caa4909eba31a0be558b3d)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2021, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/disp.h>
27 #include <sys/sdt.h>
28 
29 #include <sys/mac_provider.h>
30 
31 #include <mlxcx.h>
32 
/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 *
 * mlxcx_process_cq() compares its count of processed buffers against
 * (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP) to decide when to update the
 * consumer index, so the low water mark gap must never be smaller than
 * the high water mark gap.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
37 
38 /*
39  * Disable interrupts.
40  * The act of calling ddi_intr_disable() does not guarantee an interrupt
41  * routine is not running, so flag the vector as quiescing and wait
42  * for anything active to finish.
43  */
44 void
45 mlxcx_intr_disable(mlxcx_t *mlxp)
46 {
47 	int i;
48 
49 	mlxcx_cmd_eq_disable(mlxp);
50 
51 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
52 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
53 
54 		mutex_enter(&mleq->mleq_mtx);
55 
56 		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
57 			mutex_exit(&mleq->mleq_mtx);
58 			continue;
59 		}
60 
61 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
62 
63 		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
64 		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
65 			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);
66 
67 		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;
68 
69 		mutex_exit(&mleq->mleq_mtx);
70 	}
71 }
72 
73 void
74 mlxcx_intr_teardown(mlxcx_t *mlxp)
75 {
76 	int i;
77 	int ret;
78 
79 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
80 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
81 
82 		mutex_enter(&mleq->mleq_mtx);
83 		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
84 		if (mleq->mleq_state & MLXCX_EQ_CREATED)
85 			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
86 		if (i >= mlxp->mlx_intr_cq0) {
87 			VERIFY(avl_is_empty(&mleq->mleq_cqs));
88 			avl_destroy(&mleq->mleq_cqs);
89 		}
90 		mutex_exit(&mleq->mleq_mtx);
91 		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
92 		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
93 		if (ret != DDI_SUCCESS) {
94 			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
95 			    i, ret);
96 		}
97 		mutex_destroy(&mleq->mleq_mtx);
98 		cv_destroy(&mleq->mleq_cv);
99 	}
100 	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
101 	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
102 	mlxp->mlx_intr_handles = NULL;
103 	mlxp->mlx_eqs = NULL;
104 }
105 
106 /*
107  * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
108  */
109 static mlxcx_eventq_ent_t *
110 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
111 {
112 	mlxcx_eventq_ent_t *ent;
113 	ddi_fm_error_t err;
114 	uint_t ci;
115 	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
116 
117 	/*
118 	 * This should only be called from interrupt context to ensure
119 	 * correctness of mleq_cc.
120 	 */
121 	ASSERT(servicing_interrupt());
122 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
123 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
124 
125 	/* mleq_nents is always a power of 2 */
126 	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
127 
128 	ent = &mleq->mleq_ent[ci];
129 	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
130 	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
131 	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
132 	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
133 	    DDI_FME_VERSION);
134 	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
135 		/* The PRM says we have to membar here, so we're doing it */
136 		membar_consumer();
137 		++mleq->mleq_cc;
138 		return (ent);
139 	}
140 	/*
141 	 * In the case of a DMA error, we should re-arm this EQ and then come
142 	 * back and try again when the device wakes us back up.
143 	 *
144 	 * Hopefully the fault will be gone by then.
145 	 */
146 	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
147 
148 	return (NULL);
149 }
150 
151 void
152 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
153 {
154 	uint_t try = 0;
155 	ddi_fm_error_t err;
156 	bits32_t v = new_bits32();
157 
158 	/*
159 	 * This is only called during initialization when the EQ is
160 	 * armed for the first time, and when re-armed at the end of
161 	 * interrupt processing.
162 	 */
163 	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
164 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
165 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
166 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
167 	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
168 
169 	mleq->mleq_state |= MLXCX_EQ_ARMED;
170 	mleq->mleq_cc_armed = mleq->mleq_cc;
171 
172 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
173 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
174 
175 retry:
176 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
177 	    from_bits32(v));
178 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
179 	    DDI_FME_VERSION);
180 	if (err.fme_status == DDI_FM_OK)
181 		return;
182 	if (try++ < mlxcx_doorbell_tries) {
183 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
184 		goto retry;
185 	}
186 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
187 }
188 
189 static void
190 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
191 {
192 	bits32_t v = new_bits32();
193 	ddi_fm_error_t err;
194 
195 	/*
196 	 * This should only be called from interrupt context to ensure
197 	 * correctness of mleq_cc.
198 	 */
199 	ASSERT(servicing_interrupt());
200 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
201 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
202 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
203 
204 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
205 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
206 
207 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
208 	    from_bits32(v));
209 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
210 	    DDI_FME_VERSION);
211 	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
212 	/*
213 	 * Ignore the error, if it's still happening when we try to re-arm the
214 	 * EQ, we will note the impact then.
215 	 */
216 }
217 
218 static mlxcx_completionq_ent_t *
219 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
220 {
221 	mlxcx_completionq_ent_t *ent;
222 	ddi_fm_error_t err;
223 	uint_t ci;
224 	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
225 
226 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
227 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
228 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
229 
230 	/* mlcq_nents is always a power of 2 */
231 	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
232 
233 	ent = &mlcq->mlcq_ent[ci];
234 	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
235 	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
236 	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
237 	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
238 	    DDI_FME_VERSION);
239 	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
240 		/* The PRM says we have to membar here, so we're doing it */
241 		membar_consumer();
242 		++mlcq->mlcq_cc;
243 		return (ent);
244 	}
245 	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
246 
247 	return (NULL);
248 }
249 
250 void
251 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
252 {
253 	ddi_fm_error_t err;
254 	uint_t try = 0;
255 
256 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
257 
258 retry:
259 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
260 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
261 	    DDI_FME_VERSION);
262 	if (err.fme_status != DDI_FM_OK) {
263 		if (try++ < mlxcx_doorbell_tries) {
264 			ddi_fm_dma_err_clear(
265 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
266 			    DDI_FME_VERSION);
267 			goto retry;
268 		} else {
269 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
270 			return;
271 		}
272 	}
273 }
274 
275 void
276 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
277 {
278 	bits32_t dbval = new_bits32();
279 	uint64_t udbval;
280 	ddi_fm_error_t err;
281 	uint_t try = 0;
282 
283 	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
284 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
285 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
286 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
287 
288 	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
289 		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
290 	}
291 
292 	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
293 		return;
294 
295 	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
296 	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
297 	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
298 
299 	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
300 	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
301 
302 	udbval = (uint64_t)from_bits32(dbval) << 32;
303 	udbval |= mlcq->mlcq_num & 0xffffff;
304 
305 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
306 	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
307 
308 retry:
309 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
310 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
311 	    DDI_FME_VERSION);
312 	if (err.fme_status != DDI_FM_OK) {
313 		if (try++ < mlxcx_doorbell_tries) {
314 			ddi_fm_dma_err_clear(
315 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
316 			    DDI_FME_VERSION);
317 			goto retry;
318 		} else {
319 			goto err;
320 		}
321 	}
322 
323 	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
324 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
325 	    DDI_FME_VERSION);
326 	if (err.fme_status == DDI_FM_OK)
327 		return;
328 	if (try++ < mlxcx_doorbell_tries) {
329 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
330 		goto retry;
331 	}
332 
333 err:
334 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
335 }
336 
337 const char *
338 mlxcx_event_name(mlxcx_event_t evt)
339 {
340 	switch (evt) {
341 	case MLXCX_EVENT_COMPLETION:
342 		return ("COMPLETION");
343 	case MLXCX_EVENT_PATH_MIGRATED:
344 		return ("PATH_MIGRATED");
345 	case MLXCX_EVENT_COMM_ESTABLISH:
346 		return ("COMM_ESTABLISH");
347 	case MLXCX_EVENT_SENDQ_DRAIN:
348 		return ("SENDQ_DRAIN");
349 	case MLXCX_EVENT_LAST_WQE:
350 		return ("LAST_WQE");
351 	case MLXCX_EVENT_SRQ_LIMIT:
352 		return ("SRQ_LIMIT");
353 	case MLXCX_EVENT_DCT_ALL_CLOSED:
354 		return ("DCT_ALL_CLOSED");
355 	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
356 		return ("DCT_ACCKEY_VIOL");
357 	case MLXCX_EVENT_CQ_ERROR:
358 		return ("CQ_ERROR");
359 	case MLXCX_EVENT_WQ_CATASTROPHE:
360 		return ("WQ_CATASTROPHE");
361 	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
362 		return ("PATH_MIGRATE_FAIL");
363 	case MLXCX_EVENT_PAGE_FAULT:
364 		return ("PAGE_FAULT");
365 	case MLXCX_EVENT_WQ_INVALID_REQ:
366 		return ("WQ_INVALID_REQ");
367 	case MLXCX_EVENT_WQ_ACCESS_VIOL:
368 		return ("WQ_ACCESS_VIOL");
369 	case MLXCX_EVENT_SRQ_CATASTROPHE:
370 		return ("SRQ_CATASTROPHE");
371 	case MLXCX_EVENT_INTERNAL_ERROR:
372 		return ("INTERNAL_ERROR");
373 	case MLXCX_EVENT_PORT_STATE:
374 		return ("PORT_STATE");
375 	case MLXCX_EVENT_GPIO:
376 		return ("GPIO");
377 	case MLXCX_EVENT_PORT_MODULE:
378 		return ("PORT_MODULE");
379 	case MLXCX_EVENT_TEMP_WARNING:
380 		return ("TEMP_WARNING");
381 	case MLXCX_EVENT_REMOTE_CONFIG:
382 		return ("REMOTE_CONFIG");
383 	case MLXCX_EVENT_DCBX_CHANGE:
384 		return ("DCBX_CHANGE");
385 	case MLXCX_EVENT_DOORBELL_CONGEST:
386 		return ("DOORBELL_CONGEST");
387 	case MLXCX_EVENT_STALL_VL:
388 		return ("STALL_VL");
389 	case MLXCX_EVENT_CMD_COMPLETION:
390 		return ("CMD_COMPLETION");
391 	case MLXCX_EVENT_PAGE_REQUEST:
392 		return ("PAGE_REQUEST");
393 	case MLXCX_EVENT_NIC_VPORT:
394 		return ("NIC_VPORT");
395 	case MLXCX_EVENT_EC_PARAMS_CHANGE:
396 		return ("EC_PARAMS_CHANGE");
397 	case MLXCX_EVENT_XRQ_ERROR:
398 		return ("XRQ_ERROR");
399 	}
400 	return ("UNKNOWN");
401 }
402 
403 /* Should be called only when link state has changed. */
404 void
405 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
406 {
407 	link_state_t ls;
408 
409 	mutex_enter(&port->mlp_mtx);
410 	(void) mlxcx_cmd_query_port_status(mlxp, port);
411 	(void) mlxcx_cmd_query_port_speed(mlxp, port);
412 	(void) mlxcx_cmd_query_port_fec(mlxp, port);
413 
414 	switch (port->mlp_oper_status) {
415 	case MLXCX_PORT_STATUS_UP:
416 	case MLXCX_PORT_STATUS_UP_ONCE:
417 		ls = LINK_STATE_UP;
418 		break;
419 	case MLXCX_PORT_STATUS_DOWN:
420 		ls = LINK_STATE_DOWN;
421 		break;
422 	default:
423 		ls = LINK_STATE_UNKNOWN;
424 	}
425 
426 	if (mlxp->mlx_mac_hdl != NULL)
427 		mac_link_update(mlxp->mlx_mac_hdl, ls);
428 
429 	mutex_exit(&port->mlp_mtx);
430 }
431 
/*
 * NOTE(review): presumably guarantees that the per-command page count,
 * which mlxcx_give_pages_once() caps at MLXCX_MANAGE_PAGES_MAX_PAGES,
 * always fits in an unsigned int - confirm against mlxcx_cmd_give_pages().
 */
CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);
433 
434 static void
435 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
436 {
437 	ddi_device_acc_attr_t acc;
438 	ddi_dma_attr_t attr;
439 	mlxcx_dev_page_t *mdp;
440 	mlxcx_dev_page_t **pages;
441 	size_t i;
442 	const ddi_dma_cookie_t *ck;
443 
444 	/*
445 	 * If this isn't enough, the HCA will ask for more
446 	 */
447 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
448 
449 	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);
450 
451 	for (i = 0; i < npages; i++) {
452 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
453 		mlxcx_dma_acc_attr(mlxp, &acc);
454 		mlxcx_dma_page_attr(mlxp, &attr);
455 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
456 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
457 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
458 			    npages);
459 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
460 			goto cleanup_npages;
461 		}
462 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
463 		mdp->mxdp_pa = ck->dmac_laddress;
464 		pages[i] = mdp;
465 	}
466 
467 	mutex_enter(&mlxp->mlx_pagemtx);
468 
469 	if (!mlxcx_cmd_give_pages(mlxp,
470 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
471 		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
472 		    "pages!", npages);
473 		mutex_exit(&mlxp->mlx_pagemtx);
474 		goto cleanup_npages;
475 	}
476 
477 	for (i = 0; i < npages; i++) {
478 		avl_add(&mlxp->mlx_pages, pages[i]);
479 	}
480 	mlxp->mlx_npages += npages;
481 	mutex_exit(&mlxp->mlx_pagemtx);
482 
483 	kmem_free(pages, sizeof (*pages) * npages);
484 
485 	return;
486 
487 cleanup_npages:
488 	for (i = 0; i < npages; i++) {
489 		if ((mdp = pages[i]) == NULL)
490 			break;
491 
492 		mlxcx_dma_free(&mdp->mxdp_dma);
493 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
494 	}
495 	/* Tell the hardware we had an allocation failure. */
496 	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
497 	    0, NULL);
498 	mutex_exit(&mlxp->mlx_pagemtx);
499 
500 	kmem_free(pages, sizeof (*pages) * npages);
501 }
502 
503 static void
504 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
505 {
506 	uint_t i;
507 	int32_t ret;
508 	uint64_t *pas;
509 	mlxcx_dev_page_t *mdp, probe;
510 
511 	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);
512 
513 	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
514 		kmem_free(pas, sizeof (*pas) * npages);
515 		return;
516 	}
517 
518 	mutex_enter(&mlxp->mlx_pagemtx);
519 
520 	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
521 
522 	for (i = 0; i < ret; i++) {
523 		bzero(&probe, sizeof (probe));
524 		probe.mxdp_pa = pas[i];
525 
526 		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
527 
528 		if (mdp != NULL) {
529 			avl_remove(&mlxp->mlx_pages, mdp);
530 			mlxp->mlx_npages--;
531 			mlxcx_dma_free(&mdp->mxdp_dma);
532 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
533 		} else {
534 			mlxcx_warn(mlxp, "hardware returned a page "
535 			    "with PA 0x%" PRIx64 " but we have no "
536 			    "record of giving out such a page", pas[i]);
537 		}
538 	}
539 
540 	mutex_exit(&mlxp->mlx_pagemtx);
541 
542 	kmem_free(pas, sizeof (*pas) * npages);
543 }
544 
545 static void
546 mlxcx_pages_task(void *arg)
547 {
548 	mlxcx_async_param_t *param = arg;
549 	mlxcx_t *mlxp = param->mla_mlx;
550 	int32_t npages;
551 
552 	/*
553 	 * We can drop the pending status now, as we've extracted what
554 	 * is needed to process the pages request.
555 	 *
556 	 * Even though we should never get another pages request until
557 	 * we have responded to this, along with the guard in mlxcx_sync_intr,
558 	 * this safely allows the reuse of mlxcx_async_param_t.
559 	 */
560 	mutex_enter(&param->mla_mtx);
561 	npages = param->mla_pages.mlp_npages;
562 	param->mla_pending = B_FALSE;
563 	bzero(&param->mla_pages, sizeof (param->mla_pages));
564 	mutex_exit(&param->mla_mtx);
565 
566 	/*
567 	 * The PRM describes npages as: "Number of missing / unneeded pages
568 	 * (signed number, msb indicate sign)". The implication is that
569 	 * it will not be zero. We are expected to use this to give or
570 	 * take back pages (based on the sign) using the MANAGE_PAGES
571 	 * command but we can't determine whether to give or take
572 	 * when npages is zero. So we do nothing.
573 	 */
574 	if (npages > 0) {
575 		mlxcx_give_pages_once(mlxp, npages);
576 	} else if (npages < 0) {
577 		mlxcx_take_pages_once(mlxp, -1 * npages);
578 	}
579 }
580 
581 static void
582 mlxcx_link_state_task(void *arg)
583 {
584 	mlxcx_async_param_t *param = arg;
585 	mlxcx_port_t *port;
586 	mlxcx_t *mlxp;
587 
588 	/*
589 	 * Gather the argruments from the parameters and clear the
590 	 * pending status.
591 	 *
592 	 * The pending status must be cleared *before* we update the
593 	 * link state. This is both safe and required to ensure we always
594 	 * have the correct link state. It is safe because taskq_ents are
595 	 * reusable (by the caller of taskq_dispatch_ent()) once the
596 	 * task function has started executing. It is necessarily before
597 	 * updating the link state to guarantee further link state change
598 	 * events are not missed and we always have the current link state.
599 	 */
600 	mutex_enter(&param->mla_mtx);
601 	mlxp = param->mla_mlx;
602 	port = param->mla_port;
603 	param->mla_pending = B_FALSE;
604 	mutex_exit(&param->mla_mtx);
605 
606 	mlxcx_update_link_state(mlxp, port);
607 }
608 
609 static const char *
610 mlxcx_module_error_string(mlxcx_module_error_type_t err)
611 {
612 	switch (err) {
613 	case MLXCX_MODULE_ERR_POWER_BUDGET:
614 		return ("POWER_BUDGET");
615 	case MLXCX_MODULE_ERR_LONG_RANGE:
616 		return ("LONG_RANGE");
617 	case MLXCX_MODULE_ERR_BUS_STUCK:
618 		return ("BUS_STUCK");
619 	case MLXCX_MODULE_ERR_NO_EEPROM:
620 		return ("NO_EEPROM");
621 	case MLXCX_MODULE_ERR_ENFORCEMENT:
622 		return ("ENFORCEMENT");
623 	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
624 		return ("UNKNOWN_IDENT");
625 	case MLXCX_MODULE_ERR_HIGH_TEMP:
626 		return ("HIGH_TEMP");
627 	case MLXCX_MODULE_ERR_CABLE_SHORTED:
628 		return ("CABLE_SHORTED");
629 	default:
630 		return ("UNKNOWN");
631 	}
632 }
633 
634 static void
635 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
636 {
637 	uint64_t ena;
638 	char buf[FM_MAX_CLASS];
639 	const char *lename;
640 	const char *ename;
641 	const char *stname;
642 	uint_t eno = 0;
643 	mlxcx_module_status_t state = evd->mled_port_mod_module_status;
644 
645 	switch (state) {
646 	case MLXCX_MODULE_ERROR:
647 		stname = "error";
648 		eno = evd->mled_port_mod_error_type;
649 		lename = mlxcx_module_error_string(eno);
650 		switch (eno) {
651 		case MLXCX_MODULE_ERR_ENFORCEMENT:
652 			ename = DDI_FM_TXR_ERROR_WHITELIST;
653 			break;
654 		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
655 		case MLXCX_MODULE_ERR_NO_EEPROM:
656 			ename = DDI_FM_TXR_ERROR_NOTSUPP;
657 			break;
658 		case MLXCX_MODULE_ERR_HIGH_TEMP:
659 			ename = DDI_FM_TXR_ERROR_OVERTEMP;
660 			break;
661 		case MLXCX_MODULE_ERR_POWER_BUDGET:
662 		case MLXCX_MODULE_ERR_LONG_RANGE:
663 		case MLXCX_MODULE_ERR_CABLE_SHORTED:
664 			ename = DDI_FM_TXR_ERROR_HWFAIL;
665 			break;
666 		case MLXCX_MODULE_ERR_BUS_STUCK:
667 		default:
668 			ename = DDI_FM_TXR_ERROR_UNKNOWN;
669 		}
670 		break;
671 	default:
672 		return;
673 	}
674 
675 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
676 	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
677 	ena = fm_ena_generate(0, FM_ENA_FMT1);
678 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
679 		return;
680 
681 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
682 	    /* compulsory FM props */
683 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
684 	    /* generic NIC txr error event props */
685 	    "error", DATA_TYPE_STRING, ename,
686 	    "port_index", DATA_TYPE_UINT8, 0,
687 	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
688 	    /* local props */
689 	    "mlxcx_state", DATA_TYPE_STRING, stname,
690 	    "mlxcx_error", DATA_TYPE_STRING, lename,
691 	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
692 	    NULL);
693 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
694 }
695 
696 /*
697  * Common beginning of interrupt processing.
698  * Confirm interrupt hasn't been disabled, verify its state and
699  * mark the vector as active.
700  */
701 static boolean_t
702 mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
703 {
704 	mutex_enter(&mleq->mleq_mtx);
705 
706 	if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
707 		mutex_exit(&mleq->mleq_mtx);
708 		return (B_FALSE);
709 	}
710 
711 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
712 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
713 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
714 		mlxcx_warn(mlxp, "intr %d in bad eq state",
715 		    mleq->mleq_intr_index);
716 		mutex_exit(&mleq->mleq_mtx);
717 		return (B_FALSE);
718 	}
719 
720 	mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
721 	mutex_exit(&mleq->mleq_mtx);
722 
723 	return (B_TRUE);
724 }
725 
726 /*
727  * End of interrupt processing.
728  * Mark vector as no longer active and if shutdown is blocked on this vector,
729  * wake it up.
730  */
731 static void
732 mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
733 {
734 	mutex_enter(&mleq->mleq_mtx);
735 	if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
736 		cv_signal(&mleq->mleq_cv);
737 
738 	mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
739 	mutex_exit(&mleq->mleq_mtx);
740 }
741 
742 static uint_t
743 mlxcx_intr_async(caddr_t arg, caddr_t arg2)
744 {
745 	mlxcx_t *mlxp = (mlxcx_t *)arg;
746 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
747 	mlxcx_eventq_ent_t *ent;
748 	mlxcx_async_param_t *param;
749 	uint_t portn;
750 	uint16_t func;
751 
752 	if (!mlxcx_intr_ini(mlxp, mleq))
753 		return (DDI_INTR_CLAIMED);
754 
755 	ent = mlxcx_eq_next(mleq);
756 	if (ent == NULL) {
757 		goto done;
758 	}
759 
760 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
761 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
762 
763 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
764 		DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
765 		    ent);
766 
767 		/*
768 		 * Handle events which can be processed while we're still in
769 		 * mlxcx_attach(). Everything on the mlxcx_t which these events
770 		 * use must be allocated and set up prior to the call to
771 		 * mlxcx_setup_async_eqs().
772 		 */
773 		switch (ent->mleqe_event_type) {
774 		case MLXCX_EVENT_CMD_COMPLETION:
775 			mlxcx_cmd_completion(mlxp, ent);
776 			continue;
777 		case MLXCX_EVENT_PAGE_REQUEST:
778 			func = from_be16(ent->mleqe_page_request.
779 			    mled_page_request_function_id);
780 			VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);
781 
782 			param = &mlxp->mlx_npages_req[func];
783 			mutex_enter(&param->mla_mtx);
784 			if (param->mla_pending) {
785 				/*
786 				 * The PRM states we will not get another
787 				 * page request event until any pending have
788 				 * been posted as complete to the HCA.
789 				 * This will guard against this anyway.
790 				 */
791 				mutex_exit(&param->mla_mtx);
792 				mlxcx_warn(mlxp, "Unexpected page request "
793 				    "whilst another is pending");
794 				continue;
795 			}
796 			param->mla_pages.mlp_npages =
797 			    (int32_t)from_be32(ent->mleqe_page_request.
798 			    mled_page_request_num_pages);
799 			param->mla_pages.mlp_func = func;
800 			param->mla_pending = B_TRUE;
801 			ASSERT3P(param->mla_mlx, ==, mlxp);
802 			mutex_exit(&param->mla_mtx);
803 
804 			taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
805 			    param, 0, &param->mla_tqe);
806 			continue;
807 		}
808 
809 		/*
810 		 * All other events should be ignored while in attach.
811 		 */
812 		mutex_enter(&mleq->mleq_mtx);
813 		if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
814 			mutex_exit(&mleq->mleq_mtx);
815 			continue;
816 		}
817 		mutex_exit(&mleq->mleq_mtx);
818 
819 		switch (ent->mleqe_event_type) {
820 		case MLXCX_EVENT_PORT_STATE:
821 			portn = get_bits8(
822 			    ent->mleqe_port_state.mled_port_state_port_num,
823 			    MLXCX_EVENT_PORT_NUM) - 1;
824 			if (portn >= mlxp->mlx_nports)
825 				break;
826 
827 			param = &mlxp->mlx_ports[portn].mlx_port_event;
828 			mutex_enter(&param->mla_mtx);
829 			if (param->mla_pending) {
830 				/*
831 				 * There is a link state event pending
832 				 * processing. When that event is handled
833 				 * it will get the current link state.
834 				 */
835 				mutex_exit(&param->mla_mtx);
836 				break;
837 			}
838 
839 			ASSERT3P(param->mla_mlx, ==, mlxp);
840 			ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);
841 
842 			param->mla_pending = B_TRUE;
843 			mutex_exit(&param->mla_mtx);
844 
845 			taskq_dispatch_ent(mlxp->mlx_async_tq,
846 			    mlxcx_link_state_task, param, 0, &param->mla_tqe);
847 			break;
848 		case MLXCX_EVENT_PORT_MODULE:
849 			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
850 			break;
851 		default:
852 			mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
853 			    ent->mleqe_event_type, mleq->mleq_intr_index);
854 		}
855 	}
856 
857 	mlxcx_arm_eq(mlxp, mleq);
858 
859 done:
860 	mlxcx_intr_fini(mleq);
861 	return (DDI_INTR_CLAIMED);
862 }
863 
/*
 * Consume completion entries from a completion queue, dispatching each one
 * to the TX or RX completion routine according to the type of the work
 * queue attached to the CQ.
 *
 * mlxp		driver instance
 * mlcq		completion queue to process; mlxcx_cq_next() asserts that
 *		the caller holds mlcq_mtx
 * mpp		out: head of the b_next chain of received mblks, or NULL
 *		if nothing was received
 * bytelim	if non-zero, stop once more than this many bytes of RX
 *		completions have been processed
 *
 * Returns B_FALSE (with *mpp set to NULL) if the CQ is not in a usable
 * state or teardown is requested part way through, otherwise B_TRUE.
 */
static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
	mlxcx_completionq_ent_t *cent;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_buffer_t *buf;
	boolean_t found, added;
	size_t bytes = 0;
	uint_t rx_frames = 0;
	uint_t comp_cnt = 0;
	int64_t wqebbs, bufcnt;

	*mpp = NULL;

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (B_FALSE);
	}

	/* mp is the head of the RX chain, cmp its current tail. */
	nmp = cmp = mp = NULL;

	/*
	 * wqebbs and bufcnt accumulate what has been consumed since the
	 * driver counters were last adjusted.
	 */
	wqebbs = 0;
	bufcnt = 0;
	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 *
		 * NOTE(review): returning here leaves *mpp as NULL and skips
		 * the counter updates at the end of the function, so any
		 * mblks already chained on mp in this call are not handed
		 * back to the caller - confirm that this is intended.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			return (B_FALSE);

		comp_cnt++;
		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			atomic_dec_64(&wq->mlwq_wqebb_used);
			goto nextcq;
		}

lookagain:
		/*
		 * Generally the buffer we're looking for will be
		 * at the front of the list, so this loop won't
		 * need to look far.
		 */
		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			/* Match on the low 16 bits of the WQE index. */
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}

		if (!found) {
			/*
			 * If there's any buffers waiting on the
			 * buffers_b list, then merge those into
			 * the main list and have another look.
			 *
			 * The wq enqueue routines push new buffers
			 * into buffers_b so that they can avoid
			 * taking the mlcq_mtx and blocking us for
			 * every single packet.
			 */
			added = B_FALSE;
			mutex_enter(&mlcq->mlcq_bufbmtx);
			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
				list_move_tail(&mlcq->mlcq_buffers,
				    &mlcq->mlcq_buffers_b);
				added = B_TRUE;
			}
			mutex_exit(&mlcq->mlcq_bufbmtx);
			if (added)
				goto lookagain;

			/*
			 * This check could go just after the lookagain
			 * label, but it is a hot code path so we don't
			 * want to unnecessarily grab a lock and check
			 * a flag for a relatively rare event (the ring
			 * being stopped).
			 */
			mutex_enter(&wq->mlwq_mtx);
			if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
				mutex_exit(&wq->mlwq_mtx);
				goto nextcq;
			}
			mutex_exit(&wq->mlwq_mtx);

			/*
			 * The ring is started but no buffer matches this
			 * completion: log it, raise an ereport and move on
			 * to the next entry.
			 */
			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX :
			    buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, count this now.
		 */
		wqebbs += buf->mlb_wqebbs;

		list_remove(&mlcq->mlcq_buffers, buf);
		bufcnt++;

		switch (mlcq->mlcq_wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
			/* Bytes count even if no mblk was produced. */
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (nmp != NULL) {
				/* Append to the tail of the RX chain. */
				if (cmp != NULL) {
					cmp->b_next = nmp;
					cmp = nmp;
				} else {
					mp = cmp = nmp;
				}

				rx_frames++;
			}
			break;
		}

		/*
		 * Update the consumer index with what has been processed,
		 * followed by driver counters. It is important to tell the
		 * hardware first, otherwise when we throw more packets at
		 * it, it may get an overflow error.
		 * We do this whenever we've processed enough to bridge the
		 * high->low water mark.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			/*
			 * Both these variables are incremented using
			 * atomics as they are modified in other code paths
			 * (Eg during tx) which hold different locks.
			 */
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
			wqebbs = 0;
			bufcnt = 0;
			comp_cnt = 0;
		}
nextcq:
		/* Stop early once the per-call frame or byte limit is hit. */
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}

	/*
	 * Flush whatever was consumed since the last consumer index update,
	 * including entries which had no matching buffer.
	 */
	if (comp_cnt > 0) {
		mlxcx_update_cqci(mlxp, mlcq);
		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
	}

	*mpp = mp;
	return (B_TRUE);
}
1036 
1037 
1038 mblk_t *
1039 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
1040 {
1041 	mblk_t *mp = NULL;
1042 
1043 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
1044 
1045 	ASSERT(mlcq->mlcq_wq != NULL);
1046 	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
1047 
1048 	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
1049 
1050 	return (mp);
1051 }
1052 
1053 static uint_t
1054 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
1055 {
1056 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1057 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
1058 	mlxcx_eventq_ent_t *ent;
1059 	mlxcx_completion_queue_t *mlcq, probe;
1060 	mlxcx_work_queue_t *mlwq;
1061 	mblk_t *mp = NULL;
1062 	boolean_t tellmac = B_FALSE;
1063 
1064 	if (!mlxcx_intr_ini(mlxp, mleq))
1065 		return (DDI_INTR_CLAIMED);
1066 
1067 	ent = mlxcx_eq_next(mleq);
1068 	if (ent == NULL) {
1069 		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
1070 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
1071 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1072 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1073 			    mleq->mleq_intr_index]);
1074 		}
1075 		goto done;
1076 	}
1077 	mleq->mleq_badintrs = 0;
1078 
1079 	mutex_enter(&mleq->mleq_mtx);
1080 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
1081 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
1082 #if defined(DEBUG)
1083 	/*
1084 	 * If we're still in mlxcx_attach and an intr_n fired, something really
1085 	 * weird is going on. This shouldn't happen in the absence of a driver
1086 	 * or firmware bug, so in the interests of minimizing branches in this
1087 	 * function this check is under DEBUG.
1088 	 */
1089 	if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
1090 		mutex_exit(&mleq->mleq_mtx);
1091 		mlxcx_warn(mlxp, "intr_n (%u) fired during attach, disabling "
1092 		    "vector", mleq->mleq_intr_index);
1093 		mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
1094 		ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
1095 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[
1096 		    mleq->mleq_intr_index]);
1097 		goto done;
1098 	}
1099 #endif
1100 	mutex_exit(&mleq->mleq_mtx);
1101 
1102 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
1103 		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
1104 
1105 		probe.mlcq_num =
1106 		    from_be24(ent->mleqe_completion.mled_completion_cqn);
1107 		mutex_enter(&mleq->mleq_mtx);
1108 		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
1109 		mutex_exit(&mleq->mleq_mtx);
1110 
1111 		if (mlcq == NULL)
1112 			goto update_eq;
1113 
1114 		mlwq = mlcq->mlcq_wq;
1115 
1116 		/*
1117 		 * mlcq_arm_mtx is used to avoid race conditions between
1118 		 * this interrupt routine and the transition from polling
1119 		 * back to interrupt mode. When exiting poll mode the
1120 		 * CQ is likely to be un-armed, which means there will
1121 		 * be no events for the CQ coming though here,
1122 		 * consequently very low contention on mlcq_arm_mtx.
1123 		 *
1124 		 * mlcq_arm_mtx must be released before calls into mac
1125 		 * layer in order to avoid deadlocks.
1126 		 */
1127 		mutex_enter(&mlcq->mlcq_arm_mtx);
1128 		mlcq->mlcq_ec++;
1129 		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
1130 
1131 		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
1132 			/*
1133 			 * If we failed to take the mutex because the
1134 			 * polling function has it, just move on.
1135 			 * We don't want to block other CQs behind
1136 			 * this one.
1137 			 */
1138 			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
1139 				mutex_exit(&mlcq->mlcq_arm_mtx);
1140 				goto update_eq;
1141 			}
1142 
1143 			/* Otherwise we will wait. */
1144 			mutex_enter(&mlcq->mlcq_mtx);
1145 		}
1146 
1147 		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
1148 		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
1149 			/*
1150 			 * The ring is not in polling mode and we processed
1151 			 * some completion queue entries.
1152 			 */
1153 			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
1154 			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
1155 				atomic_and_uint(&mlcq->mlcq_state,
1156 				    ~MLXCX_CQ_BLOCKED_MAC);
1157 				tellmac = B_TRUE;
1158 			}
1159 
1160 			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
1161 			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
1162 				atomic_and_uint(&mlwq->mlwq_state,
1163 				    ~MLXCX_WQ_BLOCKED_MAC);
1164 				tellmac = B_TRUE;
1165 			}
1166 
1167 			mlxcx_arm_cq(mlxp, mlcq);
1168 
1169 			mutex_exit(&mlcq->mlcq_mtx);
1170 			mutex_exit(&mlcq->mlcq_arm_mtx);
1171 
1172 			if (tellmac) {
1173 				mac_tx_ring_update(mlxp->mlx_mac_hdl,
1174 				    mlcq->mlcq_mac_hdl);
1175 				tellmac = B_FALSE;
1176 			}
1177 
1178 			if (mp != NULL) {
1179 				mac_rx_ring(mlxp->mlx_mac_hdl,
1180 				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
1181 			}
1182 		} else {
1183 			mutex_exit(&mlcq->mlcq_mtx);
1184 			mutex_exit(&mlcq->mlcq_arm_mtx);
1185 		}
1186 
1187 update_eq:
1188 		/*
1189 		 * Updating the consumer counter for an EQ requires a write
1190 		 * to the UAR, which is possibly expensive.
1191 		 *
1192 		 * Try to do it only often enough to stop us wrapping around.
1193 		 */
1194 		if ((mleq->mleq_cc & 0x7) == 0)
1195 			mlxcx_update_eq(mlxp, mleq);
1196 	}
1197 
1198 	mlxcx_arm_eq(mlxp, mleq);
1199 
1200 done:
1201 	mlxcx_intr_fini(mleq);
1202 	return (DDI_INTR_CLAIMED);
1203 }
1204 
1205 boolean_t
1206 mlxcx_intr_setup(mlxcx_t *mlxp)
1207 {
1208 	dev_info_t *dip = mlxp->mlx_dip;
1209 	int ret;
1210 	int nintrs = 0;
1211 	int navail = 0;
1212 	int types, i;
1213 	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
1214 
1215 	ret = ddi_intr_get_supported_types(dip, &types);
1216 	if (ret != DDI_SUCCESS) {
1217 		mlxcx_warn(mlxp, "Failed to get supported interrupt types");
1218 		return (B_FALSE);
1219 	}
1220 
1221 	if (!(types & DDI_INTR_TYPE_MSIX)) {
1222 		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
1223 		    "requires MSI-X");
1224 		return (B_FALSE);
1225 	}
1226 
1227 	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
1228 	if (ret != DDI_SUCCESS) {
1229 		mlxcx_warn(mlxp, "Failed to get number of interrupts");
1230 		return (B_FALSE);
1231 	}
1232 	if (nintrs < 2) {
1233 		mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx "
1234 		    "requires 2", nintrs);
1235 		return (B_FALSE);
1236 	}
1237 
1238 	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
1239 	if (ret != DDI_SUCCESS) {
1240 		mlxcx_warn(mlxp,
1241 		    "Failed to get number of available interrupts");
1242 		return (B_FALSE);
1243 	}
1244 	if (navail < 2) {
1245 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
1246 		    "requires 2", navail);
1247 		return (B_FALSE);
1248 	}
1249 
1250 	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
1251 	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
1252 	/*
1253 	 * Interrupts for Completion Queues events start from vector 1
1254 	 * up to available vectors. Vector 0 is used for asynchronous
1255 	 * events.
1256 	 */
1257 	mlxp->mlx_intr_cq0 = 1;
1258 
1259 	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
1260 	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
1261 	if (ret != DDI_SUCCESS) {
1262 		mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
1263 		mlxcx_intr_teardown(mlxp);
1264 		return (B_FALSE);
1265 	}
1266 	if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
1267 		mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
1268 		    "requires %d", mlxp->mlx_intr_count,
1269 		    mlxp->mlx_intr_cq0 + 1);
1270 		mlxcx_intr_teardown(mlxp);
1271 		return (B_FALSE);
1272 	}
1273 	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
1274 
1275 	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
1276 	if (ret != DDI_SUCCESS) {
1277 		mlxcx_warn(mlxp, "Failed to get interrupt priority");
1278 		mlxcx_intr_teardown(mlxp);
1279 		return (B_FALSE);
1280 	}
1281 
1282 	/*
1283 	 * Set the interrupt priority for the asynchronous handler higher
1284 	 * than the ring handlers. Some operations which issue commands,
1285 	 * and thus rely on the async interrupt handler for posting
1286 	 * completion, do so with a CQ mutex held. The CQ mutex is also
1287 	 * acquired during ring processing, so if the ring processing vector
1288 	 * happens to be assigned to the same CPU as the async vector
1289 	 * it can hold off the async interrupt thread and lead to a deadlock.
1290 	 * By assigning a higher priority to the async vector, it will
1291 	 * always be dispatched.
1292 	 */
1293 	mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
1294 	if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
1295 		mlxp->mlx_async_intr_pri++;
1296 	} else {
1297 		mlxp->mlx_intr_pri--;
1298 	}
1299 
1300 	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1301 	    sizeof (mlxcx_event_queue_t);
1302 	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1303 
1304 	/*
1305 	 * In the failure path, mlxcx_intr_teardown() expects this
1306 	 * mutex and avl tree to be init'ed - so do it now.
1307 	 */
1308 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1309 		uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
1310 		    mlxp->mlx_intr_pri;
1311 
1312 		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1313 		    DDI_INTR_PRI(pri));
1314 		cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);
1315 
1316 		if (i < mlxp->mlx_intr_cq0)
1317 			continue;
1318 
1319 		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1320 		    sizeof (mlxcx_completion_queue_t),
1321 		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1322 	}
1323 
1324 	while (mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN) {
1325 		ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
1326 		    mlxp->mlx_async_intr_pri);
1327 		if (ret == DDI_SUCCESS)
1328 			break;
1329 		mlxcx_note(mlxp,
1330 		    "!Failed to set interrupt priority to %u for "
1331 		    "async interrupt vector", mlxp->mlx_async_intr_pri);
1332 		/*
1333 		 * If it was not possible to set the IPL for the async
1334 		 * interrupt to the desired value, then try a lower priority.
1335 		 * Some PSMs can only accommodate a limited number of vectors
1336 		 * at eatch priority level (or group of priority levels). Since
1337 		 * the async priority must be set higher than the ring
1338 		 * handlers, lower both. The ring handler priority is set
1339 		 * below.
1340 		 */
1341 		mlxp->mlx_async_intr_pri--;
1342 		mlxp->mlx_intr_pri--;
1343 	}
1344 
1345 	if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) {
1346 		mlxcx_warn(mlxp, "Failed to find an interrupt priority for "
1347 		    "async interrupt vector");
1348 		mlxcx_intr_teardown(mlxp);
1349 		return (B_FALSE);
1350 	}
1351 
1352 	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
1353 	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1354 	if (ret != DDI_SUCCESS) {
1355 		mlxcx_warn(mlxp, "Failed to add async interrupt handler");
1356 		mlxcx_intr_teardown(mlxp);
1357 		return (B_FALSE);
1358 	}
1359 
1360 	/*
1361 	 * If we have enough interrupts, set their "type" fields so that we
1362 	 * avoid mixing RX and TX queues on the same EQs.
1363 	 */
1364 	if (mlxp->mlx_intr_count >= 8) {
1365 		eqt = MLXCX_EQ_TYPE_RX;
1366 	}
1367 
1368 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
1369 		mlxp->mlx_eqs[i].mleq_intr_index = i;
1370 
1371 		mlxp->mlx_eqs[i].mleq_type = eqt;
1372 		/*
1373 		 * If eqt is still ANY, just leave it set to that
1374 		 * (no else here).
1375 		 */
1376 		if (eqt == MLXCX_EQ_TYPE_RX) {
1377 			eqt = MLXCX_EQ_TYPE_TX;
1378 		} else if (eqt == MLXCX_EQ_TYPE_TX) {
1379 			eqt = MLXCX_EQ_TYPE_RX;
1380 		}
1381 
1382 		while (mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN) {
1383 			ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
1384 			    mlxp->mlx_intr_pri);
1385 			if (ret == DDI_SUCCESS)
1386 				break;
1387 			mlxcx_note(mlxp, "!Failed to set interrupt priority to "
1388 			    "%u for interrupt vector %d",
1389 			    mlxp->mlx_intr_pri, i);
1390 			mlxp->mlx_intr_pri--;
1391 		}
1392 		if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) {
1393 			mlxcx_warn(mlxp,
1394 			    "Failed to find an interrupt priority for "
1395 			    "interrupt vector %d", i);
1396 			mlxcx_intr_teardown(mlxp);
1397 			return (B_FALSE);
1398 		}
1399 
1400 		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1401 		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1402 		if (ret != DDI_SUCCESS) {
1403 			mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
1404 			    i);
1405 			mlxcx_intr_teardown(mlxp);
1406 			return (B_FALSE);
1407 		}
1408 	}
1409 
1410 	return (B_TRUE);
1411 }
1412