xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_intr.c (revision cd61ae21816e53b94bc1673f3f1aa651fc3115e8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2020, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  */
16 
17 /*
18  * Mellanox Connect-X 4/5/6 driver.
19  */
20 
21 #include <sys/modctl.h>
22 #include <sys/conf.h>
23 #include <sys/devops.h>
24 #include <sys/sysmacros.h>
25 
26 #include <sys/mac_provider.h>
27 
28 #include <mlxcx.h>
29 
30 /*
31  * CTASSERT(s) to cover bad values which would induce bugs.
32  */
33 CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
34 
35 void
36 mlxcx_intr_teardown(mlxcx_t *mlxp)
37 {
38 	int i;
39 	int ret;
40 
41 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
42 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
43 		mutex_enter(&mleq->mleq_mtx);
44 		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
45 		if (mleq->mleq_state & MLXCX_EQ_CREATED)
46 			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
47 		if (i != 0) {
48 			VERIFY(avl_is_empty(&mleq->mleq_cqs));
49 			avl_destroy(&mleq->mleq_cqs);
50 		}
51 		mutex_exit(&mleq->mleq_mtx);
52 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
53 		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
54 		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
55 		if (ret != DDI_SUCCESS) {
56 			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
57 			    i, ret);
58 		}
59 		mutex_destroy(&mleq->mleq_mtx);
60 	}
61 	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
62 	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
63 	mlxp->mlx_intr_handles = NULL;
64 	mlxp->mlx_eqs = NULL;
65 }
66 
67 /*
68  * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
69  */
70 static mlxcx_eventq_ent_t *
71 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
72 {
73 	mlxcx_eventq_ent_t *ent;
74 	ddi_fm_error_t err;
75 	uint_t ci;
76 	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
77 
78 	ASSERT(mutex_owned(&mleq->mleq_mtx));
79 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
80 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
81 
82 	/* mleq_nents is always a power of 2 */
83 	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
84 
85 	ent = &mleq->mleq_ent[ci];
86 	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
87 	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
88 	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
89 	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
90 	    DDI_FME_VERSION);
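	/*
	 * The owner bit in each entry alternates with every pass around the
	 * ring, so the low bit of (mleq_cc >> mleq_entshift) is the value we
	 * expect to see in entries that software currently owns. If it does
	 * not match (or the DMA sync failed), we have caught up with the
	 * hardware.
	 */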
91 	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
92 		/* The PRM says we have to membar here, so we're doing it */
93 		membar_consumer();
94 		++mleq->mleq_cc;
95 		return (ent);
96 	}
97 	/*
98 	 * In the case of a DMA error, we should re-arm this EQ and then come
99 	 * back and try again when the device wakes us back up.
100 	 *
101 	 * Hopefully the fault will be gone by then.
102 	 */
103 	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
104 
105 	return (NULL);
106 }
107 
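/*
 * Arm the EQ so that the device will raise an interrupt for the next event it
 * posts. We record the consumer counter at the time of arming and write it,
 * along with the EQ number, to the UAR arm register, retrying on FM access
 * errors up to mlxcx_doorbell_tries before declaring the service lost.
 */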
108 void
109 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
110 {
111 	uint_t try = 0;
112 	ddi_fm_error_t err;
113 	bits32_t v = new_bits32();
114 
115 	ASSERT(mutex_owned(&mleq->mleq_mtx));
116 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
117 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
118 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
119 	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
120 
121 	mleq->mleq_state |= MLXCX_EQ_ARMED;
122 	mleq->mleq_cc_armed = mleq->mleq_cc;
123 
124 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
125 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
126 
127 retry:
128 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
129 	    from_bits32(v));
130 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
131 	    DDI_FME_VERSION);
132 	if (err.fme_status == DDI_FM_OK)
133 		return;
134 	if (try++ < mlxcx_doorbell_tries) {
135 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
136 		goto retry;
137 	}
138 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
139 }
140 
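/*
 * Write the EQ consumer counter to the UAR without re-arming the EQ, so the
 * device can reclaim the entries we have already consumed without raising
 * another interrupt.
 */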
141 static void
142 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
143 {
144 	bits32_t v = new_bits32();
145 	ddi_fm_error_t err;
146 
147 	ASSERT(mutex_owned(&mleq->mleq_mtx));
148 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
149 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
150 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
151 
152 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
153 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
154 
155 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
156 	    from_bits32(v));
157 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
158 	    DDI_FME_VERSION);
159 	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
160 	/*
161 	 * Ignore the error; if it's still happening when we try to re-arm the
162 	 * EQ, we will note the impact then.
163 	 */
164 }
165 
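/*
 * Get the next SW-owned entry on the completion queue, or NULL if we have
 * caught up with the hardware. This mirrors mlxcx_eq_next(): sync the entry
 * for the CPU and compare its owner bit against the phase derived from our
 * consumer counter.
 */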
166 static mlxcx_completionq_ent_t *
167 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
168 {
169 	mlxcx_completionq_ent_t *ent;
170 	ddi_fm_error_t err;
171 	uint_t ci;
172 	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
173 
174 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
175 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
176 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177 
178 	/* mlcq_nents is always a power of 2 */
179 	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
180 
181 	ent = &mlcq->mlcq_ent[ci];
182 	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
183 	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
184 	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
185 	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
186 	    DDI_FME_VERSION);
187 	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
188 		/* The PRM says we have to membar here, so we're doing it */
189 		membar_consumer();
190 		++mlcq->mlcq_cc;
191 		return (ent);
192 	}
193 	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
194 
195 	return (NULL);
196 }
197 
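/*
 * Tell the device how far into the CQ we have read by writing our consumer
 * counter into the CQ's doorbell record and syncing it for the device. DMA
 * errors are retried up to mlxcx_doorbell_tries before we report the service
 * as lost.
 */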
198 void
199 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
200 {
201 	ddi_fm_error_t err;
202 	uint_t try = 0;
203 
204 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
205 
206 retry:
207 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
208 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
209 	    DDI_FME_VERSION);
210 	if (err.fme_status != DDI_FM_OK) {
211 		if (try++ < mlxcx_doorbell_tries) {
212 			ddi_fm_dma_err_clear(
213 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
214 			    DDI_FME_VERSION);
215 			goto retry;
216 		} else {
217 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
218 			return;
219 		}
220 	}
221 }
222 
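/*
 * Arm the CQ so that the device posts a completion event (and hence an
 * interrupt on the parent EQ) for the next completion written to it. We
 * update both the in-memory doorbell record and the UAR doorbell register;
 * the UAR write also carries the CQ number.
 */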
223 void
224 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
225 {
226 	bits32_t dbval = new_bits32();
227 	uint64_t udbval;
228 	ddi_fm_error_t err;
229 	uint_t try = 0;
230 
231 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
232 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
233 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
234 
235 	if (mlcq->mlcq_state & MLXCX_CQ_ARMED)
236 		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
237 
238 	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
239 		return;
240 
241 	mlcq->mlcq_state |= MLXCX_CQ_ARMED;
242 	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
243 	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
244 
245 	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
246 	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
247 
248 	udbval = (uint64_t)from_bits32(dbval) << 32;
249 	udbval |= mlcq->mlcq_num & 0xffffff;
250 
251 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
252 	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
253 
254 retry:
255 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
256 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
257 	    DDI_FME_VERSION);
258 	if (err.fme_status != DDI_FM_OK) {
259 		if (try++ < mlxcx_doorbell_tries) {
260 			ddi_fm_dma_err_clear(
261 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
262 			    DDI_FME_VERSION);
263 			goto retry;
264 		} else {
265 			goto err;
266 		}
267 	}
268 
269 	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
270 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
271 	    DDI_FME_VERSION);
272 	if (err.fme_status == DDI_FM_OK)
273 		return;
274 	if (try++ < mlxcx_doorbell_tries) {
275 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
276 		goto retry;
277 	}
278 
279 err:
280 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
281 }
282 
283 const char *
284 mlxcx_event_name(mlxcx_event_t evt)
285 {
286 	switch (evt) {
287 	case MLXCX_EVENT_COMPLETION:
288 		return ("COMPLETION");
289 	case MLXCX_EVENT_PATH_MIGRATED:
290 		return ("PATH_MIGRATED");
291 	case MLXCX_EVENT_COMM_ESTABLISH:
292 		return ("COMM_ESTABLISH");
293 	case MLXCX_EVENT_SENDQ_DRAIN:
294 		return ("SENDQ_DRAIN");
295 	case MLXCX_EVENT_LAST_WQE:
296 		return ("LAST_WQE");
297 	case MLXCX_EVENT_SRQ_LIMIT:
298 		return ("SRQ_LIMIT");
299 	case MLXCX_EVENT_DCT_ALL_CLOSED:
300 		return ("DCT_ALL_CLOSED");
301 	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
302 		return ("DCT_ACCKEY_VIOL");
303 	case MLXCX_EVENT_CQ_ERROR:
304 		return ("CQ_ERROR");
305 	case MLXCX_EVENT_WQ_CATASTROPHE:
306 		return ("WQ_CATASTROPHE");
307 	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
308 		return ("PATH_MIGRATE_FAIL");
309 	case MLXCX_EVENT_PAGE_FAULT:
310 		return ("PAGE_FAULT");
311 	case MLXCX_EVENT_WQ_INVALID_REQ:
312 		return ("WQ_INVALID_REQ");
313 	case MLXCX_EVENT_WQ_ACCESS_VIOL:
314 		return ("WQ_ACCESS_VIOL");
315 	case MLXCX_EVENT_SRQ_CATASTROPHE:
316 		return ("SRQ_CATASTROPHE");
317 	case MLXCX_EVENT_INTERNAL_ERROR:
318 		return ("INTERNAL_ERROR");
319 	case MLXCX_EVENT_PORT_STATE:
320 		return ("PORT_STATE");
321 	case MLXCX_EVENT_GPIO:
322 		return ("GPIO");
323 	case MLXCX_EVENT_PORT_MODULE:
324 		return ("PORT_MODULE");
325 	case MLXCX_EVENT_TEMP_WARNING:
326 		return ("TEMP_WARNING");
327 	case MLXCX_EVENT_REMOTE_CONFIG:
328 		return ("REMOTE_CONFIG");
329 	case MLXCX_EVENT_DCBX_CHANGE:
330 		return ("DCBX_CHANGE");
331 	case MLXCX_EVENT_DOORBELL_CONGEST:
332 		return ("DOORBELL_CONGEST");
333 	case MLXCX_EVENT_STALL_VL:
334 		return ("STALL_VL");
335 	case MLXCX_EVENT_CMD_COMPLETION:
336 		return ("CMD_COMPLETION");
337 	case MLXCX_EVENT_PAGE_REQUEST:
338 		return ("PAGE_REQUEST");
339 	case MLXCX_EVENT_NIC_VPORT:
340 		return ("NIC_VPORT");
341 	case MLXCX_EVENT_EC_PARAMS_CHANGE:
342 		return ("EC_PARAMS_CHANGE");
343 	case MLXCX_EVENT_XRQ_ERROR:
344 		return ("XRQ_ERROR");
345 	}
346 	return ("UNKNOWN");
347 }
348 
349 /* Should be called only when link state has changed. */
350 void
351 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
352 {
353 	link_state_t ls;
354 
355 	mutex_enter(&port->mlp_mtx);
356 	(void) mlxcx_cmd_query_port_status(mlxp, port);
357 	(void) mlxcx_cmd_query_port_speed(mlxp, port);
358 
359 	switch (port->mlp_oper_status) {
360 	case MLXCX_PORT_STATUS_UP:
361 	case MLXCX_PORT_STATUS_UP_ONCE:
362 		ls = LINK_STATE_UP;
363 		break;
364 	case MLXCX_PORT_STATUS_DOWN:
365 		ls = LINK_STATE_DOWN;
366 		break;
367 	default:
368 		ls = LINK_STATE_UNKNOWN;
369 	}
370 	mac_link_update(mlxp->mlx_mac_hdl, ls);
371 
372 	mutex_exit(&port->mlp_mtx);
373 }
374 
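/*
 * Allocate up to MLXCX_MANAGE_PAGES_MAX_PAGES pages of DMA memory and give
 * them to the device in response to a page request. If allocation fails, we
 * free whatever we did allocate and report the failure to the hardware
 * instead.
 */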
375 static void
376 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
377 {
378 	ddi_device_acc_attr_t acc;
379 	ddi_dma_attr_t attr;
380 	mlxcx_dev_page_t *mdp;
381 	int32_t togive;
382 	mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
383 	uint_t i;
384 	const ddi_dma_cookie_t *ck;
385 
386 	togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
387 
388 	for (i = 0; i < togive; i++) {
389 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
390 		mlxcx_dma_acc_attr(mlxp, &acc);
391 		mlxcx_dma_page_attr(mlxp, &attr);
392 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
393 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
394 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
395 			    togive);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			/*
			 * Clean up only the pages we managed to allocate;
			 * cleanup_npages expects mlx_pagemtx to be held.
			 */
			togive = i;
			mutex_enter(&mlxp->mlx_pagemtx);
396 			goto cleanup_npages;
397 		}
398 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
399 		mdp->mxdp_pa = ck->dmac_laddress;
400 		pages[i] = mdp;
401 	}
402 
403 	mutex_enter(&mlxp->mlx_pagemtx);
404 
405 	if (!mlxcx_cmd_give_pages(mlxp,
406 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
407 		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
408 		    "pages!", togive);
409 		goto cleanup_npages;
410 	}
411 
412 	for (i = 0; i < togive; i++) {
413 		avl_add(&mlxp->mlx_pages, pages[i]);
414 	}
415 	mlxp->mlx_npages += togive;
416 	mutex_exit(&mlxp->mlx_pagemtx);
417 
418 	return;
419 
420 cleanup_npages:
421 	for (i = 0; i < togive; i++) {
422 		mdp = pages[i];
423 		mlxcx_dma_free(&mdp->mxdp_dma);
424 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
425 	}
426 	/* Tell the hardware we had an allocation failure. */
427 	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
428 	    0, NULL);
429 	mutex_exit(&mlxp->mlx_pagemtx);
430 }
431 
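/*
 * Reclaim pages from the device: ask it to return up to
 * MLXCX_MANAGE_PAGES_MAX_PAGES pages, then free each returned page and drop
 * it from our tracking AVL tree.
 */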
432 static void
433 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
434 {
435 	uint_t i;
436 	int32_t req, ret;
437 	uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
438 	mlxcx_dev_page_t *mdp, probe;
439 
440 	mutex_enter(&mlxp->mlx_pagemtx);
441 
442 	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
443 	req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
444 
445 	if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
		mutex_exit(&mlxp->mlx_pagemtx);
446 		return;
447 	}
448 
449 	for (i = 0; i < ret; i++) {
450 		bzero(&probe, sizeof (probe));
451 		probe.mxdp_pa = pas[i];
452 
453 		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
454 
455 		if (mdp != NULL) {
456 			avl_remove(&mlxp->mlx_pages, mdp);
457 			mlxp->mlx_npages--;
458 			mlxcx_dma_free(&mdp->mxdp_dma);
459 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
460 		} else {
461 			mlxcx_warn(mlxp, "hardware returned a page "
462 			    "with PA 0x%" PRIx64 " but we have no "
463 			    "record of giving out such a page", pas[i]);
464 		}
465 	}
466 
467 	mutex_exit(&mlxp->mlx_pagemtx);
468 }
469 
470 static const char *
471 mlxcx_module_error_string(mlxcx_module_error_type_t err)
472 {
473 	switch (err) {
474 	case MLXCX_MODULE_ERR_POWER_BUDGET:
475 		return ("POWER_BUDGET");
476 	case MLXCX_MODULE_ERR_LONG_RANGE:
477 		return ("LONG_RANGE");
478 	case MLXCX_MODULE_ERR_BUS_STUCK:
479 		return ("BUS_STUCK");
480 	case MLXCX_MODULE_ERR_NO_EEPROM:
481 		return ("NO_EEPROM");
482 	case MLXCX_MODULE_ERR_ENFORCEMENT:
483 		return ("ENFORCEMENT");
484 	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
485 		return ("UNKNOWN_IDENT");
486 	case MLXCX_MODULE_ERR_HIGH_TEMP:
487 		return ("HIGH_TEMP");
488 	case MLXCX_MODULE_ERR_CABLE_SHORTED:
489 		return ("CABLE_SHORTED");
490 	default:
491 		return ("UNKNOWN");
492 	}
493 }
494 
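/*
 * Post an FM ereport describing a transceiver module error that the device
 * reported through a PORT_MODULE event.
 */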
495 static void
496 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
497 {
498 	uint64_t ena;
499 	char buf[FM_MAX_CLASS];
500 	const char *lename;
501 	const char *ename;
502 	const char *stname;
503 	uint_t eno = 0;
504 	mlxcx_module_status_t state = evd->mled_port_mod_module_status;
505 
506 	switch (state) {
507 	case MLXCX_MODULE_ERROR:
508 		stname = "error";
509 		eno = evd->mled_port_mod_error_type;
510 		lename = mlxcx_module_error_string(eno);
511 		switch (eno) {
512 		case MLXCX_MODULE_ERR_ENFORCEMENT:
513 			ename = DDI_FM_TXR_ERROR_WHITELIST;
514 			break;
515 		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
516 		case MLXCX_MODULE_ERR_NO_EEPROM:
517 			ename = DDI_FM_TXR_ERROR_NOTSUPP;
518 			break;
519 		case MLXCX_MODULE_ERR_HIGH_TEMP:
520 			ename = DDI_FM_TXR_ERROR_OVERTEMP;
521 			break;
522 		case MLXCX_MODULE_ERR_POWER_BUDGET:
523 		case MLXCX_MODULE_ERR_LONG_RANGE:
524 		case MLXCX_MODULE_ERR_CABLE_SHORTED:
525 			ename = DDI_FM_TXR_ERROR_HWFAIL;
526 			break;
527 		case MLXCX_MODULE_ERR_BUS_STUCK:
528 		default:
529 			ename = DDI_FM_TXR_ERROR_UNKNOWN;
530 		}
531 		break;
532 	default:
533 		return;
534 	}
535 
536 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
537 	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
538 	ena = fm_ena_generate(0, FM_ENA_FMT1);
539 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
540 		return;
541 
542 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
543 	    /* compulsory FM props */
544 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
545 	    /* generic NIC txr error event props */
546 	    "error", DATA_TYPE_STRING, ename,
547 	    "port_index", DATA_TYPE_UINT8, 0,
548 	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
549 	    /* local props */
550 	    "mlxcx_state", DATA_TYPE_STRING, stname,
551 	    "mlxcx_error", DATA_TYPE_STRING, lename,
552 	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
553 	    NULL);
554 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
555 }
556 
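/*
 * Interrupt handler for vector 0, which carries the device's async events:
 * page requests, port state changes and port module errors. Completion
 * events are handled by mlxcx_intr_n() on the remaining vectors.
 */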
557 static uint_t
558 mlxcx_intr_0(caddr_t arg, caddr_t arg2)
559 {
560 	mlxcx_t *mlxp = (mlxcx_t *)arg;
561 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
562 	mlxcx_eventq_ent_t *ent;
563 	mlxcx_port_t *port;
564 	uint_t portn;
565 	int32_t npages = 0;
566 
567 	mutex_enter(&mleq->mleq_mtx);
568 
569 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
570 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
571 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
572 		mlxcx_warn(mlxp, "int %d on bad eq state",
573 		    mleq->mleq_intr_index);
574 		mutex_exit(&mleq->mleq_mtx);
575 		return (DDI_INTR_UNCLAIMED);
576 	}
577 
578 	ent = mlxcx_eq_next(mleq);
579 	if (ent == NULL) {
580 		mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
581 		mutex_exit(&mleq->mleq_mtx);
582 		return (DDI_INTR_UNCLAIMED);
583 	}
584 
585 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
586 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
587 
588 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
589 		switch (ent->mleqe_event_type) {
590 		case MLXCX_EVENT_PAGE_REQUEST:
591 			VERIFY3U(from_be16(ent->mleqe_page_request.
592 			    mled_page_request_function_id), ==, 0);
593 			npages += (int32_t)from_be32(ent->mleqe_page_request.
594 			    mled_page_request_num_pages);
595 			break;
596 		case MLXCX_EVENT_PORT_STATE:
597 			portn = get_bits8(
598 			    ent->mleqe_port_state.mled_port_state_port_num,
599 			    MLXCX_EVENT_PORT_NUM) - 1;
600 			if (portn >= mlxp->mlx_nports)
601 				break;
602 			port = &mlxp->mlx_ports[portn];
603 			mlxcx_update_link_state(mlxp, port);
604 			break;
605 		case MLXCX_EVENT_PORT_MODULE:
606 			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
607 			break;
608 		default:
609 			mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
610 			    ent->mleqe_event_type, mleq->mleq_intr_index);
611 		}
612 	}
613 
614 	if (npages > 0) {
615 		mlxcx_give_pages_once(mlxp, npages);
616 	} else if (npages < 0) {
617 		mlxcx_take_pages_once(mlxp, -1 * npages);
618 	}
619 
620 	mlxcx_arm_eq(mlxp, mleq);
621 	mutex_exit(&mleq->mleq_mtx);
622 
623 	return (DDI_INTR_CLAIMED);
624 }
625 
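/*
 * Process pending entries on a completion queue: match each completion to the
 * buffer that produced it, hand it to the TX or RX completion path, and chain
 * any received mblks onto *mpp for the caller. The CQ consumer index and the
 * buffer/WQEBB counters are updated along the way. Returns B_FALSE if the CQ
 * is being torn down and processing should stop.
 */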
626 static boolean_t
627 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
628     size_t bytelim)
629 {
630 	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
631 	mlxcx_completionq_ent_t *cent;
632 	mblk_t *mp, *cmp, *nmp;
633 	mlxcx_buffer_t *buf;
634 	boolean_t found, added;
635 	size_t bytes = 0;
636 	uint_t rx_frames = 0;
637 	uint_t comp_cnt = 0;
638 	int64_t wqebbs, bufcnt;
639 
640 	*mpp = NULL;
641 
642 	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
643 	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
644 	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
645 	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
646 		return (B_FALSE);
647 	}
648 
649 	nmp = cmp = mp = NULL;
650 
651 	wqebbs = 0;
652 	bufcnt = 0;
653 	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
654 	    cent = mlxcx_cq_next(mlcq)) {
655 		/*
656 		 * Teardown and ring stop can atomic_or this flag
657 		 * into our state if they want us to stop early.
658 		 */
659 		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
660 			return (B_FALSE);
661 
662 		comp_cnt++;
663 		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
664 		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
665 			/* NOP */
666 			atomic_dec_64(&wq->mlwq_wqebb_used);
667 			goto nextcq;
668 		}
669 
670 lookagain:
671 		/*
672 		 * Generally the buffer we're looking for will be
673 		 * at the front of the list, so this loop won't
674 		 * need to look far.
675 		 */
676 		buf = list_head(&mlcq->mlcq_buffers);
677 		found = B_FALSE;
678 		while (buf != NULL) {
679 			if ((buf->mlb_wqe_index & UINT16_MAX) ==
680 			    from_be16(cent->mlcqe_wqe_counter)) {
681 				found = B_TRUE;
682 				break;
683 			}
684 			buf = list_next(&mlcq->mlcq_buffers, buf);
685 		}
686 
687 		if (!found) {
688 			/*
689 			 * If there's any buffers waiting on the
690 			 * buffers_b list, then merge those into
691 			 * the main list and have another look.
692 			 *
693 			 * The wq enqueue routines push new buffers
694 			 * into buffers_b so that they can avoid
695 			 * taking the mlcq_mtx and blocking us for
696 			 * every single packet.
697 			 */
698 			added = B_FALSE;
699 			mutex_enter(&mlcq->mlcq_bufbmtx);
700 			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
701 				list_move_tail(&mlcq->mlcq_buffers,
702 				    &mlcq->mlcq_buffers_b);
703 				added = B_TRUE;
704 			}
705 			mutex_exit(&mlcq->mlcq_bufbmtx);
706 			if (added)
707 				goto lookagain;
708 
709 			buf = list_head(&mlcq->mlcq_buffers);
710 			mlxcx_warn(mlxp, "got completion on CQ %x but "
711 			    "no buffer matching wqe found: %x (first "
712 			    "buffer counter = %x)", mlcq->mlcq_num,
713 			    from_be16(cent->mlcqe_wqe_counter),
714 			    buf == NULL ? UINT32_MAX :
715 			    buf->mlb_wqe_index);
716 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
717 			goto nextcq;
718 		}
719 
720 		/*
721 		 * The buf is likely to be freed below, so count this now.
722 		 */
723 		wqebbs += buf->mlb_wqebbs;
724 
725 		list_remove(&mlcq->mlcq_buffers, buf);
726 		bufcnt++;
727 
728 		switch (mlcq->mlcq_wq->mlwq_type) {
729 		case MLXCX_WQ_TYPE_SENDQ:
730 			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
731 			break;
732 		case MLXCX_WQ_TYPE_RECVQ:
733 			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
734 			bytes += from_be32(cent->mlcqe_byte_cnt);
735 			if (nmp != NULL) {
736 				if (cmp != NULL) {
737 					cmp->b_next = nmp;
738 					cmp = nmp;
739 				} else {
740 					mp = cmp = nmp;
741 				}
742 
743 				rx_frames++;
744 			}
745 			break;
746 		}
747 
748 		/*
749 		 * Update the consumer index with what has been processed,
750 		 * followed by driver counters. It is important to tell the
751 		 * hardware first; otherwise, when we throw more packets at
752 		 * it, it may get an overflow error.
753 		 * We do this whenever we've processed enough to bridge the
754 		 * high->low water mark.
755 		 */
756 		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
757 			mlxcx_update_cqci(mlxp, mlcq);
758 			/*
759 			 * Both these variables are updated using atomics, as
760 			 * they are also modified in other code paths (e.g.
761 			 * during tx) which hold different locks.
762 			 */
763 			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
764 			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
765 			wqebbs = 0;
766 			bufcnt = 0;
767 			comp_cnt = 0;
768 		}
769 nextcq:
770 		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
771 		    (bytelim != 0 && bytes > bytelim))
772 			break;
773 	}
774 
775 	if (comp_cnt > 0) {
776 		mlxcx_update_cqci(mlxp, mlcq);
777 		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
778 		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
779 	}
780 
781 	*mpp = mp;
782 	return (B_TRUE);
783 }
784 
785 
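/*
 * Gather received packets from a CQ while its ring is in polling mode (no
 * interrupts). A non-zero bytelim limits how many bytes we will process in
 * one call.
 */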
786 mblk_t *
787 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
788 {
789 	mblk_t *mp = NULL;
790 
791 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
792 
793 	ASSERT(mlcq->mlcq_wq != NULL);
794 	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
795 
796 	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
797 
798 	return (mp);
799 }
800 
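/*
 * Interrupt handler for vectors 1..n, which carry completion events. For each
 * event we look up the CQ, process its completions (unless the ring is busy
 * polling), pass received packets and TX ring updates up to MAC, and finally
 * re-arm the CQ and the EQ.
 */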
801 static uint_t
802 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
803 {
804 	mlxcx_t *mlxp = (mlxcx_t *)arg;
805 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
806 	mlxcx_eventq_ent_t *ent;
807 	mlxcx_completion_queue_t *mlcq, probe;
808 	mlxcx_work_queue_t *mlwq;
809 	mblk_t *mp = NULL;
810 	boolean_t tellmac = B_FALSE;
811 
812 	mutex_enter(&mleq->mleq_mtx);
813 
814 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
815 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
816 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
817 		mutex_exit(&mleq->mleq_mtx);
818 		return (DDI_INTR_CLAIMED);
819 	}
820 
821 	ent = mlxcx_eq_next(mleq);
822 	if (ent == NULL) {
823 		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
824 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
825 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
826 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
827 			    mleq->mleq_intr_index]);
828 		}
829 		mutex_exit(&mleq->mleq_mtx);
830 		return (DDI_INTR_CLAIMED);
831 	}
832 	mleq->mleq_badintrs = 0;
833 
834 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
835 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
836 
837 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
838 		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
839 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
840 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
841 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
842 			    mleq->mleq_intr_index]);
843 			mutex_exit(&mleq->mleq_mtx);
844 			return (DDI_INTR_CLAIMED);
845 		}
846 		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
847 
848 		probe.mlcq_num =
849 		    from_be24(ent->mleqe_completion.mled_completion_cqn);
850 		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
851 
852 		if (mlcq == NULL)
853 			continue;
854 
855 		mlwq = mlcq->mlcq_wq;
856 
857 		/*
858 		 * The polling function might have the mutex and stop us from
859 		 * getting the lock in mlxcx_process_cq(), so we increment
860 		 * the event counter atomically from outside.
861 		 *
862 		 * This way at the end of polling when we go back to interrupts
863 		 * from this CQ, the event counter is still correct.
864 		 *
865 		 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so
866 		 * as to avoid any possibility of racing against us here, so we
867 		 * only have to consider mlxcx_rx_poll().
868 		 */
869 		atomic_inc_32(&mlcq->mlcq_ec);
870 		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
871 
872 		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
873 			/*
874 			 * If we failed to take the mutex because the
875 			 * polling function has it, just move on.
876 			 * We don't want to block other CQs behind
877 			 * this one.
878 			 */
879 			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
880 				goto update_eq;
881 
882 			/* Otherwise we will wait. */
883 			mutex_enter(&mlcq->mlcq_mtx);
884 		}
885 
886 		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
887 		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
888 			/*
889 			 * The ring is not in polling mode and we processed
890 			 * some completion queue entries.
891 			 */
892 			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
893 			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
894 				atomic_and_uint(&mlcq->mlcq_state,
895 				    ~MLXCX_CQ_BLOCKED_MAC);
896 				tellmac = B_TRUE;
897 			}
898 
899 			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
900 			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
901 				atomic_and_uint(&mlwq->mlwq_state,
902 				    ~MLXCX_WQ_BLOCKED_MAC);
903 				tellmac = B_TRUE;
904 			}
905 
906 			mlxcx_arm_cq(mlxp, mlcq);
907 
908 			mutex_exit(&mlcq->mlcq_mtx);
909 
910 			if (tellmac) {
911 				mac_tx_ring_update(mlxp->mlx_mac_hdl,
912 				    mlcq->mlcq_mac_hdl);
913 				tellmac = B_FALSE;
914 			}
915 
916 			if (mp != NULL) {
917 				mac_rx_ring(mlxp->mlx_mac_hdl,
918 				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
919 			}
920 		} else {
921 			mutex_exit(&mlcq->mlcq_mtx);
922 		}
923 
924 update_eq:
925 		/*
926 		 * Updating the consumer counter for an EQ requires a write
927 		 * to the UAR, which is possibly expensive.
928 		 *
929 		 * Try to do it only often enough to stop us wrapping around.
930 		 */
931 		if ((mleq->mleq_cc & 0x7) == 0)
932 			mlxcx_update_eq(mlxp, mleq);
933 	}
934 
935 	mlxcx_arm_eq(mlxp, mleq);
936 	mutex_exit(&mleq->mleq_mtx);
937 
938 	return (DDI_INTR_CLAIMED);
939 }
940 
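/*
 * Allocate our MSI-X interrupts and the per-vector event queue structures.
 * Vector 0 is dedicated to async events (mlxcx_intr_0); the remaining vectors
 * service completion EQs (mlxcx_intr_n), alternating between RX and TX when
 * enough vectors are available.
 */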
941 boolean_t
942 mlxcx_intr_setup(mlxcx_t *mlxp)
943 {
944 	dev_info_t *dip = mlxp->mlx_dip;
945 	int ret;
946 	int nintrs = 0;
947 	int navail = 0;
948 	int types, i;
949 	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
950 
951 	ret = ddi_intr_get_supported_types(dip, &types);
952 	if (ret != DDI_SUCCESS) {
953 		return (B_FALSE);
954 	}
955 
956 	if (!(types & DDI_INTR_TYPE_MSIX)) {
957 		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
958 		    "requires MSI-X");
959 		return (B_FALSE);
960 	}
961 
962 	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
963 	if (ret != DDI_SUCCESS) {
964 		return (B_FALSE);
965 	}
966 	if (nintrs < 2) {
967 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
968 		    "requires 2", nintrs);
969 		return (B_FALSE);
970 	}
971 
972 	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
973 	if (ret != DDI_SUCCESS || navail < 2) {
974 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
975 		    "requires 2", navail);
976 		return (B_FALSE);
977 	}
978 
979 	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
980 	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
981 
982 	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
983 	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
984 	if (ret != DDI_SUCCESS) {
985 		mlxcx_intr_teardown(mlxp);
986 		return (B_FALSE);
987 	}
988 	if (mlxp->mlx_intr_count < 2) {
989 		mlxcx_intr_teardown(mlxp);
990 		return (B_FALSE);
991 	}
992 	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
993 
994 	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
995 	if (ret != DDI_SUCCESS) {
996 		mlxcx_intr_teardown(mlxp);
997 		return (B_FALSE);
998 	}
999 
1000 	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1001 	    sizeof (mlxcx_event_queue_t);
1002 	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1003 
1004 	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0,
1005 	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1006 	if (ret != DDI_SUCCESS) {
1007 		mlxcx_intr_teardown(mlxp);
1008 		return (B_FALSE);
1009 	}
1010 
1011 	/*
1012 	 * If we have enough interrupts, set their "type" fields so that we
1013 	 * avoid mixing RX and TX queues on the same EQs.
1014 	 */
1015 	if (mlxp->mlx_intr_count >= 8) {
1016 		eqt = MLXCX_EQ_TYPE_RX;
1017 	}
1018 
1019 	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
1020 		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1021 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1022 		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1023 		    sizeof (mlxcx_completion_queue_t),
1024 		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1025 		mlxp->mlx_eqs[i].mleq_intr_index = i;
1026 
1027 		mlxp->mlx_eqs[i].mleq_type = eqt;
1028 		/*
1029 		 * If eqt is still ANY, just leave it set to that
1030 		 * (no else here).
1031 		 */
1032 		if (eqt == MLXCX_EQ_TYPE_RX) {
1033 			eqt = MLXCX_EQ_TYPE_TX;
1034 		} else if (eqt == MLXCX_EQ_TYPE_TX) {
1035 			eqt = MLXCX_EQ_TYPE_RX;
1036 		}
1037 
1038 		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1039 		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1040 		if (ret != DDI_SUCCESS) {
1041 			mlxcx_intr_teardown(mlxp);
1042 			return (B_FALSE);
1043 		}
1044 	}
1045 
1046 	return (B_TRUE);
1047 }
1048