xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_intr.c (revision 069e6b7e31ba5dcbc5441b98af272714d9a5455c)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2020, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  */
16 
17 /*
18  * Mellanox Connect-X 4/5/6 driver.
19  */
20 
21 #include <sys/modctl.h>
22 #include <sys/conf.h>
23 #include <sys/devops.h>
24 #include <sys/sysmacros.h>
25 
26 #include <sys/mac_provider.h>
27 
28 #include <mlxcx.h>
29 
30 /*
31  * CTASSERT(s) to cover bad values which would induce bugs.
32  */
33 CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
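/*
 * mlxcx_process_cq() below uses (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP) as its
 * batch threshold for pushing consumer index updates out to the hardware, so
 * that difference must never be negative.
 */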
34 
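/*
 * Free all interrupt-related state.  By the time this is called the EQs must
 * already have been destroyed (or never created), which the VERIFYs below
 * check.
 */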
35 void
36 mlxcx_intr_teardown(mlxcx_t *mlxp)
37 {
38 	int i;
39 	int ret;
40 
41 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
42 		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
43 		mutex_enter(&mleq->mleq_mtx);
44 		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
45 		if (mleq->mleq_state & MLXCX_EQ_CREATED)
46 			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
47 		if (i != 0) {
48 			VERIFY(avl_is_empty(&mleq->mleq_cqs));
49 			avl_destroy(&mleq->mleq_cqs);
50 		}
51 		mutex_exit(&mleq->mleq_mtx);
52 		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
53 		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
54 		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
55 		if (ret != DDI_SUCCESS) {
56 			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
57 			    i, ret);
58 		}
59 		mutex_destroy(&mleq->mleq_mtx);
60 	}
61 	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
62 	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
63 	mlxp->mlx_intr_handles = NULL;
64 	mlxp->mlx_eqs = NULL;
65 }
66 
67 /*
68  * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
69  */
70 static mlxcx_eventq_ent_t *
71 mlxcx_eq_next(mlxcx_event_queue_t *mleq)
72 {
73 	mlxcx_eventq_ent_t *ent;
74 	ddi_fm_error_t err;
75 	uint_t ci;
76 	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);
77 
78 	ASSERT(mutex_owned(&mleq->mleq_mtx));
79 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
80 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
81 
82 	/* mleq_nents is always a power of 2 */
83 	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);
84 
85 	ent = &mleq->mleq_ent[ci];
86 	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
87 	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
88 	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
89 	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
90 	    DDI_FME_VERSION);
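	/*
	 * Each EQ entry carries an ownership bit whose value alternates each
	 * time the queue wraps.  swowner above is the parity of how many
	 * times our consumer counter has wrapped, so an entry whose owner bit
	 * matches it is one the hardware has handed over to us.
	 */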
91 	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
92 		/* The PRM says we have to membar here, so we're doing it */
93 		membar_consumer();
94 		++mleq->mleq_cc;
95 		return (ent);
96 	}
97 	/*
98 	 * In the case of a DMA error, we should re-arm this EQ and then come
99 	 * back and try again when the device wakes us back up.
100 	 *
101 	 * Hopefully the fault will be gone by then.
102 	 */
103 	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);
104 
105 	return (NULL);
106 }
107 
108 void
109 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
110 {
111 	uint_t try = 0;
112 	ddi_fm_error_t err;
113 	bits32_t v = new_bits32();
114 
115 	ASSERT(mutex_owned(&mleq->mleq_mtx));
116 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
117 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
118 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
119 	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);
120 
121 	mleq->mleq_state |= MLXCX_EQ_ARMED;
122 	mleq->mleq_cc_armed = mleq->mleq_cc;
123 
124 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
125 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
126 
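	/*
	 * Re-arm the EQ by writing its number and our consumer counter to the
	 * EQ_ARM UAR register; the device will then interrupt us again for
	 * the next event it posts.  Doorbell writes are retried a bounded
	 * number of times if FM reports an access error.
	 */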
127 retry:
128 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
129 	    from_bits32(v));
130 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
131 	    DDI_FME_VERSION);
132 	if (err.fme_status == DDI_FM_OK)
133 		return;
134 	if (try++ < mlxcx_doorbell_tries) {
135 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
136 		goto retry;
137 	}
138 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
139 }
140 
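/*
 * Let the device know how far we have got through the EQ without re-arming
 * it.  This is used periodically from mlxcx_intr_n() while we are still
 * processing events, to stop the hardware from wrapping around us.
 */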
141 static void
142 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
143 {
144 	bits32_t v = new_bits32();
145 	ddi_fm_error_t err;
146 
147 	ASSERT(mutex_owned(&mleq->mleq_mtx));
148 	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
149 	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
150 	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
151 
152 	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
153 	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);
154 
155 	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
156 	    from_bits32(v));
157 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
158 	    DDI_FME_VERSION);
159 	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
160 	/*
161 	 * Ignore the error; if it's still happening when we try to re-arm the
162 	 * EQ, we will note the impact then.
163 	 */
164 }
165 
166 static mlxcx_completionq_ent_t *
167 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
168 {
169 	mlxcx_completionq_ent_t *ent;
170 	ddi_fm_error_t err;
171 	uint_t ci;
172 	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);
173 
174 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
175 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
176 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
177 
178 	/* mlcq_nents is always a power of 2 */
179 	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
180 
181 	ent = &mlcq->mlcq_ent[ci];
182 	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
183 	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
184 	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
185 	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
186 	    DDI_FME_VERSION);
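	/* Same ownership-bit scheme as mlxcx_eq_next() above. */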
187 	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
188 		/* The PRM says we have to membar here, so we're doing it */
189 		membar_consumer();
190 		++mlcq->mlcq_cc;
191 		return (ent);
192 	}
193 	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);
194 
195 	return (NULL);
196 }
197 
198 void
199 mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
200 {
201 	ddi_fm_error_t err;
202 	uint_t try = 0;
203 
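	/*
	 * The consumer index is published to the device through the CQ's
	 * doorbell record in host memory, so all we need to do here is
	 * DMA-sync it towards the device, retrying on FM errors.
	 */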
204 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
205 
206 retry:
207 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
208 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
209 	    DDI_FME_VERSION);
210 	if (err.fme_status != DDI_FM_OK) {
211 		if (try++ < mlxcx_doorbell_tries) {
212 			ddi_fm_dma_err_clear(
213 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
214 			    DDI_FME_VERSION);
215 			goto retry;
216 		} else {
217 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
218 			return;
219 		}
220 	}
221 }
222 
223 void
224 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
225 {
226 	bits32_t dbval = new_bits32();
227 	uint64_t udbval;
228 	ddi_fm_error_t err;
229 	uint_t try = 0;
230 
231 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
232 	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
233 	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);
234 
235 	if (mlcq->mlcq_state & MLXCX_CQ_ARMED)
236 		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
237 
238 	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
239 		return;
240 
241 	mlcq->mlcq_state |= MLXCX_CQ_ARMED;
242 	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
243 	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;
244 
245 	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
246 	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);
247 
248 	udbval = (uint64_t)from_bits32(dbval) << 32;
249 	udbval |= mlcq->mlcq_num & 0xffffff;
250 
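	/*
	 * Arming a CQ takes a single 64-bit UAR write: the arm sequence
	 * number and consumer index in the upper 32 bits and the CQ number in
	 * the low 24 bits.  The same CI and arm values are also written into
	 * the CQ's doorbell record and synced to the device first.
	 */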
251 	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
252 	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;
253 
254 retry:
255 	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
256 	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
257 	    DDI_FME_VERSION);
258 	if (err.fme_status != DDI_FM_OK) {
259 		if (try++ < mlxcx_doorbell_tries) {
260 			ddi_fm_dma_err_clear(
261 			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
262 			    DDI_FME_VERSION);
263 			goto retry;
264 		} else {
265 			goto err;
266 		}
267 	}
268 
269 	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
270 	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
271 	    DDI_FME_VERSION);
272 	if (err.fme_status == DDI_FM_OK)
273 		return;
274 	if (try++ < mlxcx_doorbell_tries) {
275 		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
276 		goto retry;
277 	}
278 
279 err:
280 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
281 }
282 
283 const char *
284 mlxcx_event_name(mlxcx_event_t evt)
285 {
286 	switch (evt) {
287 	case MLXCX_EVENT_COMPLETION:
288 		return ("COMPLETION");
289 	case MLXCX_EVENT_PATH_MIGRATED:
290 		return ("PATH_MIGRATED");
291 	case MLXCX_EVENT_COMM_ESTABLISH:
292 		return ("COMM_ESTABLISH");
293 	case MLXCX_EVENT_SENDQ_DRAIN:
294 		return ("SENDQ_DRAIN");
295 	case MLXCX_EVENT_LAST_WQE:
296 		return ("LAST_WQE");
297 	case MLXCX_EVENT_SRQ_LIMIT:
298 		return ("SRQ_LIMIT");
299 	case MLXCX_EVENT_DCT_ALL_CLOSED:
300 		return ("DCT_ALL_CLOSED");
301 	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
302 		return ("DCT_ACCKEY_VIOL");
303 	case MLXCX_EVENT_CQ_ERROR:
304 		return ("CQ_ERROR");
305 	case MLXCX_EVENT_WQ_CATASTROPHE:
306 		return ("WQ_CATASTROPHE");
307 	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
308 		return ("PATH_MIGRATE_FAIL");
309 	case MLXCX_EVENT_PAGE_FAULT:
310 		return ("PAGE_FAULT");
311 	case MLXCX_EVENT_WQ_INVALID_REQ:
312 		return ("WQ_INVALID_REQ");
313 	case MLXCX_EVENT_WQ_ACCESS_VIOL:
314 		return ("WQ_ACCESS_VIOL");
315 	case MLXCX_EVENT_SRQ_CATASTROPHE:
316 		return ("SRQ_CATASTROPHE");
317 	case MLXCX_EVENT_INTERNAL_ERROR:
318 		return ("INTERNAL_ERROR");
319 	case MLXCX_EVENT_PORT_STATE:
320 		return ("PORT_STATE");
321 	case MLXCX_EVENT_GPIO:
322 		return ("GPIO");
323 	case MLXCX_EVENT_PORT_MODULE:
324 		return ("PORT_MODULE");
325 	case MLXCX_EVENT_TEMP_WARNING:
326 		return ("TEMP_WARNING");
327 	case MLXCX_EVENT_REMOTE_CONFIG:
328 		return ("REMOTE_CONFIG");
329 	case MLXCX_EVENT_DCBX_CHANGE:
330 		return ("DCBX_CHANGE");
331 	case MLXCX_EVENT_DOORBELL_CONGEST:
332 		return ("DOORBELL_CONGEST");
333 	case MLXCX_EVENT_STALL_VL:
334 		return ("STALL_VL");
335 	case MLXCX_EVENT_CMD_COMPLETION:
336 		return ("CMD_COMPLETION");
337 	case MLXCX_EVENT_PAGE_REQUEST:
338 		return ("PAGE_REQUEST");
339 	case MLXCX_EVENT_NIC_VPORT:
340 		return ("NIC_VPORT");
341 	case MLXCX_EVENT_EC_PARAMS_CHANGE:
342 		return ("EC_PARAMS_CHANGE");
343 	case MLXCX_EVENT_XRQ_ERROR:
344 		return ("XRQ_ERROR");
345 	}
346 	return ("UNKNOWN");
347 }
348 
349 /* Should be called only when link state has changed. */
350 void
351 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
352 {
353 	link_state_t ls;
354 
355 	mutex_enter(&port->mlp_mtx);
356 	(void) mlxcx_cmd_query_port_status(mlxp, port);
357 	(void) mlxcx_cmd_query_port_speed(mlxp, port);
358 	(void) mlxcx_cmd_query_port_fec(mlxp, port);
359 
360 	switch (port->mlp_oper_status) {
361 	case MLXCX_PORT_STATUS_UP:
362 	case MLXCX_PORT_STATUS_UP_ONCE:
363 		ls = LINK_STATE_UP;
364 		break;
365 	case MLXCX_PORT_STATUS_DOWN:
366 		ls = LINK_STATE_DOWN;
367 		break;
368 	default:
369 		ls = LINK_STATE_UNKNOWN;
370 	}
371 	mac_link_update(mlxp->mlx_mac_hdl, ls);
372 
373 	mutex_exit(&port->mlp_mtx);
374 }
375 
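/*
 * Allocate up to MLXCX_MANAGE_PAGES_MAX_PAGES 4k pages and hand them to the
 * device in response to a page request.  If anything fails along the way, the
 * pages we did allocate are freed and the device is told about the allocation
 * failure instead.
 */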
376 static void
377 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
378 {
379 	ddi_device_acc_attr_t acc;
380 	ddi_dma_attr_t attr;
381 	mlxcx_dev_page_t *mdp;
382 	int32_t togive;
383 	mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
384 	uint_t i;
385 	const ddi_dma_cookie_t *ck;
386 
387 	togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
388 
389 	for (i = 0; i < togive; i++) {
390 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
391 		mlxcx_dma_acc_attr(mlxp, &acc);
392 		mlxcx_dma_page_attr(mlxp, &attr);
393 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
394 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
395 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
396 			    togive);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			togive = i;
397 			goto cleanup_npages;
398 		}
399 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
400 		mdp->mxdp_pa = ck->dmac_laddress;
401 		pages[i] = mdp;
402 	}
403 
404 	mutex_enter(&mlxp->mlx_pagemtx);
405 
406 	if (!mlxcx_cmd_give_pages(mlxp,
407 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
408 		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
409 		    "pages!", togive);
		mutex_exit(&mlxp->mlx_pagemtx);
410 		goto cleanup_npages;
411 	}
412 
413 	for (i = 0; i < togive; i++) {
414 		avl_add(&mlxp->mlx_pages, pages[i]);
415 	}
416 	mlxp->mlx_npages += togive;
417 	mutex_exit(&mlxp->mlx_pagemtx);
418 
419 	return;
420 
421 cleanup_npages:
	/* Free just the pages we managed to allocate; no locks are held. */
422 	for (i = 0; i < togive; i++) {
423 		mdp = pages[i];
424 		mlxcx_dma_free(&mdp->mxdp_dma);
425 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
426 	}
427 	/* Tell the hardware we had an allocation failure. */
428 	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
429 	    0, NULL);
431 }
432 
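/*
 * Ask the device to return up to npages (capped at
 * MLXCX_MANAGE_PAGES_MAX_PAGES) of the pages we have given it, and free the
 * corresponding entries from our page tree.
 */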
433 static void
434 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
435 {
436 	uint_t i;
437 	int32_t req, ret;
438 	uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
439 	mlxcx_dev_page_t *mdp, probe;
440 
441 	mutex_enter(&mlxp->mlx_pagemtx);
442 
443 	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
444 	req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
445 
446 	if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
		mutex_exit(&mlxp->mlx_pagemtx);
447 		return;
448 	}
449 
450 	for (i = 0; i < ret; i++) {
451 		bzero(&probe, sizeof (probe));
452 		probe.mxdp_pa = pas[i];
453 
454 		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
455 
456 		if (mdp != NULL) {
457 			avl_remove(&mlxp->mlx_pages, mdp);
458 			mlxp->mlx_npages--;
459 			mlxcx_dma_free(&mdp->mxdp_dma);
460 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
461 		} else {
462 			mlxcx_warn(mlxp, "hardware returned a page "
463 			    "with PA 0x%" PRIx64 " but we have no "
464 			    "record of giving out such a page", pas[i]);
465 		}
466 	}
467 
468 	mutex_exit(&mlxp->mlx_pagemtx);
469 }
470 
471 static const char *
472 mlxcx_module_error_string(mlxcx_module_error_type_t err)
473 {
474 	switch (err) {
475 	case MLXCX_MODULE_ERR_POWER_BUDGET:
476 		return ("POWER_BUDGET");
477 	case MLXCX_MODULE_ERR_LONG_RANGE:
478 		return ("LONG_RANGE");
479 	case MLXCX_MODULE_ERR_BUS_STUCK:
480 		return ("BUS_STUCK");
481 	case MLXCX_MODULE_ERR_NO_EEPROM:
482 		return ("NO_EEPROM");
483 	case MLXCX_MODULE_ERR_ENFORCEMENT:
484 		return ("ENFORCEMENT");
485 	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
486 		return ("UNKNOWN_IDENT");
487 	case MLXCX_MODULE_ERR_HIGH_TEMP:
488 		return ("HIGH_TEMP");
489 	case MLXCX_MODULE_ERR_CABLE_SHORTED:
490 		return ("CABLE_SHORTED");
491 	default:
492 		return ("UNKNOWN");
493 	}
494 }
495 
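/*
 * Post an FM ereport for a transceiver module error, mapping the hardware's
 * module error type onto the generic NIC transceiver error classes.
 */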
496 static void
497 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
498 {
499 	uint64_t ena;
500 	char buf[FM_MAX_CLASS];
501 	const char *lename;
502 	const char *ename;
503 	const char *stname;
504 	uint_t eno = 0;
505 	mlxcx_module_status_t state = evd->mled_port_mod_module_status;
506 
507 	switch (state) {
508 	case MLXCX_MODULE_ERROR:
509 		stname = "error";
510 		eno = evd->mled_port_mod_error_type;
511 		lename = mlxcx_module_error_string(eno);
512 		switch (eno) {
513 		case MLXCX_MODULE_ERR_ENFORCEMENT:
514 			ename = DDI_FM_TXR_ERROR_WHITELIST;
515 			break;
516 		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
517 		case MLXCX_MODULE_ERR_NO_EEPROM:
518 			ename = DDI_FM_TXR_ERROR_NOTSUPP;
519 			break;
520 		case MLXCX_MODULE_ERR_HIGH_TEMP:
521 			ename = DDI_FM_TXR_ERROR_OVERTEMP;
522 			break;
523 		case MLXCX_MODULE_ERR_POWER_BUDGET:
524 		case MLXCX_MODULE_ERR_LONG_RANGE:
525 		case MLXCX_MODULE_ERR_CABLE_SHORTED:
526 			ename = DDI_FM_TXR_ERROR_HWFAIL;
527 			break;
528 		case MLXCX_MODULE_ERR_BUS_STUCK:
529 		default:
530 			ename = DDI_FM_TXR_ERROR_UNKNOWN;
531 		}
532 		break;
533 	default:
534 		return;
535 	}
536 
537 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
538 	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
539 	ena = fm_ena_generate(0, FM_ENA_FMT1);
540 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
541 		return;
542 
543 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
544 	    /* compulsory FM props */
545 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
546 	    /* generic NIC txr error event props */
547 	    "error", DATA_TYPE_STRING, ename,
548 	    "port_index", DATA_TYPE_UINT8, 0,
549 	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
550 	    /* local props */
551 	    "mlxcx_state", DATA_TYPE_STRING, stname,
552 	    "mlxcx_error", DATA_TYPE_STRING, lename,
553 	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
554 	    NULL);
555 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
556 }
557 
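/*
 * Interrupt handler for EQ 0, which carries the device's asynchronous events:
 * page requests, port state changes and port module events.  Completion
 * events are handled by mlxcx_intr_n() on the remaining vectors.
 */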
558 static uint_t
559 mlxcx_intr_0(caddr_t arg, caddr_t arg2)
560 {
561 	mlxcx_t *mlxp = (mlxcx_t *)arg;
562 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
563 	mlxcx_eventq_ent_t *ent;
564 	mlxcx_port_t *port;
565 	uint_t portn;
566 	int32_t npages = 0;
567 
568 	mutex_enter(&mleq->mleq_mtx);
569 
570 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
571 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
572 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
573 		mlxcx_warn(mlxp, "int %d on bad eq state",
574 		    mleq->mleq_intr_index);
575 		mutex_exit(&mleq->mleq_mtx);
576 		return (DDI_INTR_UNCLAIMED);
577 	}
578 
579 	ent = mlxcx_eq_next(mleq);
580 	if (ent == NULL) {
581 		mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
582 		mutex_exit(&mleq->mleq_mtx);
583 		return (DDI_INTR_UNCLAIMED);
584 	}
585 
586 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
587 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
588 
589 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
590 		switch (ent->mleqe_event_type) {
591 		case MLXCX_EVENT_PAGE_REQUEST:
592 			VERIFY3U(from_be16(ent->mleqe_page_request.
593 			    mled_page_request_function_id), ==, 0);
594 			npages += (int32_t)from_be32(ent->mleqe_page_request.
595 			    mled_page_request_num_pages);
596 			break;
597 		case MLXCX_EVENT_PORT_STATE:
598 			portn = get_bits8(
599 			    ent->mleqe_port_state.mled_port_state_port_num,
600 			    MLXCX_EVENT_PORT_NUM) - 1;
601 			if (portn >= mlxp->mlx_nports)
602 				break;
603 			port = &mlxp->mlx_ports[portn];
604 			mlxcx_update_link_state(mlxp, port);
605 			break;
606 		case MLXCX_EVENT_PORT_MODULE:
607 			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
608 			break;
609 		default:
610 			mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
611 			    ent->mleqe_event_type, mleq->mleq_intr_index);
612 		}
613 	}
614 
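	/*
	 * We treat the page request count as signed: a positive total over
	 * this batch of events means the device wants more pages; a negative
	 * total means it is returning pages to us.
	 */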
615 	if (npages > 0) {
616 		mlxcx_give_pages_once(mlxp, npages);
617 	} else if (npages < 0) {
618 		mlxcx_take_pages_once(mlxp, -1 * npages);
619 	}
620 
621 	mlxcx_arm_eq(mlxp, mleq);
622 	mutex_exit(&mleq->mleq_mtx);
623 
624 	return (DDI_INTR_CLAIMED);
625 }
626 
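/*
 * Process completion entries from a single CQ, matching each one back to its
 * buffer.  TX completions are finished here; RX completions are chained into
 * an mblk list which is handed back via mpp.  A non-zero bytelim bounds how
 * many RX bytes we will process in one call.  Returns B_FALSE if the CQ is
 * (or becomes) unusable due to teardown.
 */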
627 static boolean_t
628 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
629     size_t bytelim)
630 {
631 	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
632 	mlxcx_completionq_ent_t *cent;
633 	mblk_t *mp, *cmp, *nmp;
634 	mlxcx_buffer_t *buf;
635 	boolean_t found, added;
636 	size_t bytes = 0;
637 	uint_t rx_frames = 0;
638 	uint_t comp_cnt = 0;
639 	int64_t wqebbs, bufcnt;
640 
641 	*mpp = NULL;
642 
643 	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
644 	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
645 	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
646 	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
647 		return (B_FALSE);
648 	}
649 
650 	nmp = cmp = mp = NULL;
651 
652 	wqebbs = 0;
653 	bufcnt = 0;
654 	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
655 	    cent = mlxcx_cq_next(mlcq)) {
656 		/*
657 		 * Teardown and ring stop can atomic_or this flag
658 		 * into our state if they want us to stop early.
659 		 */
660 		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
661 			return (B_FALSE);
662 
663 		comp_cnt++;
664 		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
665 		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
666 			/* NOP */
667 			atomic_dec_64(&wq->mlwq_wqebb_used);
668 			goto nextcq;
669 		}
670 
671 lookagain:
672 		/*
673 		 * Generally the buffer we're looking for will be
674 		 * at the front of the list, so this loop won't
675 		 * need to look far.
676 		 */
677 		buf = list_head(&mlcq->mlcq_buffers);
678 		found = B_FALSE;
679 		while (buf != NULL) {
680 			if ((buf->mlb_wqe_index & UINT16_MAX) ==
681 			    from_be16(cent->mlcqe_wqe_counter)) {
682 				found = B_TRUE;
683 				break;
684 			}
685 			buf = list_next(&mlcq->mlcq_buffers, buf);
686 		}
687 
688 		if (!found) {
689 			/*
690 			 * If there's any buffers waiting on the
691 			 * buffers_b list, then merge those into
692 			 * the main list and have another look.
693 			 *
694 			 * The wq enqueue routines push new buffers
695 			 * into buffers_b so that they can avoid
696 			 * taking the mlcq_mtx and blocking us for
697 			 * every single packet.
698 			 */
699 			added = B_FALSE;
700 			mutex_enter(&mlcq->mlcq_bufbmtx);
701 			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
702 				list_move_tail(&mlcq->mlcq_buffers,
703 				    &mlcq->mlcq_buffers_b);
704 				added = B_TRUE;
705 			}
706 			mutex_exit(&mlcq->mlcq_bufbmtx);
707 			if (added)
708 				goto lookagain;
709 
710 			buf = list_head(&mlcq->mlcq_buffers);
711 			mlxcx_warn(mlxp, "got completion on CQ %x but "
712 			    "no buffer matching wqe found: %x (first "
713 			    "buffer counter = %x)", mlcq->mlcq_num,
714 			    from_be16(cent->mlcqe_wqe_counter),
715 			    buf == NULL ? UINT32_MAX :
716 			    buf->mlb_wqe_index);
717 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
718 			goto nextcq;
719 		}
720 
721 		/*
722 		 * The buf is likely to be freed below, count this now.
723 		 */
724 		wqebbs += buf->mlb_wqebbs;
725 
726 		list_remove(&mlcq->mlcq_buffers, buf);
727 		bufcnt++;
728 
729 		switch (mlcq->mlcq_wq->mlwq_type) {
730 		case MLXCX_WQ_TYPE_SENDQ:
731 			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
732 			break;
733 		case MLXCX_WQ_TYPE_RECVQ:
734 			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
735 			bytes += from_be32(cent->mlcqe_byte_cnt);
736 			if (nmp != NULL) {
737 				if (cmp != NULL) {
738 					cmp->b_next = nmp;
739 					cmp = nmp;
740 				} else {
741 					mp = cmp = nmp;
742 				}
743 
744 				rx_frames++;
745 			}
746 			break;
747 		}
748 
749 		/*
750 		 * Update the consumer index with what has been processed,
751 		 * followed by driver counters. It is important to tell the
752 		 * hardware first; otherwise, when we throw more packets at
753 		 * it, it may get an overflow error.
754 		 * We do this whenever we've processed enough to bridge the
755 		 * high->low water mark.
756 		 */
757 		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
758 			mlxcx_update_cqci(mlxp, mlcq);
759 			/*
760 			 * Both these variables are incremented using
761 			 * atomics as they are modified in other code paths
762 			 * (Eg during tx) which hold different locks.
763 			 */
764 			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
765 			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
766 			wqebbs = 0;
767 			bufcnt = 0;
768 			comp_cnt = 0;
769 		}
770 nextcq:
771 		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
772 		    (bytelim != 0 && bytes > bytelim))
773 			break;
774 	}
775 
776 	if (comp_cnt > 0) {
777 		mlxcx_update_cqci(mlxp, mlcq);
778 		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
779 		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
780 	}
781 
782 	*mpp = mp;
783 	return (B_TRUE);
784 }
785 
786 
787 mblk_t *
788 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
789 {
790 	mblk_t *mp = NULL;
791 
792 	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
793 
794 	ASSERT(mlcq->mlcq_wq != NULL);
795 	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
796 
797 	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
798 
799 	return (mp);
800 }
801 
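/*
 * Interrupt handler for the completion EQs (vectors 1 and up).  Each event
 * names a CQ, which we look up and process unless its ring is currently in
 * polling mode.
 */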
802 static uint_t
803 mlxcx_intr_n(caddr_t arg, caddr_t arg2)
804 {
805 	mlxcx_t *mlxp = (mlxcx_t *)arg;
806 	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
807 	mlxcx_eventq_ent_t *ent;
808 	mlxcx_completion_queue_t *mlcq, probe;
809 	mlxcx_work_queue_t *mlwq;
810 	mblk_t *mp = NULL;
811 	boolean_t tellmac = B_FALSE;
812 
813 	mutex_enter(&mleq->mleq_mtx);
814 
815 	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
816 	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
817 	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
818 		mutex_exit(&mleq->mleq_mtx);
819 		return (DDI_INTR_CLAIMED);
820 	}
821 
822 	ent = mlxcx_eq_next(mleq);
823 	if (ent == NULL) {
824 		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
825 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
826 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
827 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
828 			    mleq->mleq_intr_index]);
829 		}
830 		mutex_exit(&mleq->mleq_mtx);
831 		return (DDI_INTR_CLAIMED);
832 	}
833 	mleq->mleq_badintrs = 0;
834 
835 	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
836 	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
837 
838 	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
839 		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
840 			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
841 			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
842 			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
843 			    mleq->mleq_intr_index]);
844 			mutex_exit(&mleq->mleq_mtx);
845 			return (DDI_INTR_CLAIMED);
846 		}
847 		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);
848 
849 		probe.mlcq_num =
850 		    from_be24(ent->mleqe_completion.mled_completion_cqn);
851 		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
852 
853 		if (mlcq == NULL)
854 			continue;
855 
856 		mlwq = mlcq->mlcq_wq;
857 
858 		/*
859 		 * The polling function might have the mutex and stop us from
860 		 * getting the lock in mlxcx_process_cq(), so we increment
861 		 * the event counter atomically from outside.
862 		 *
863 		 * This way at the end of polling when we go back to interrupts
864 		 * from this CQ, the event counter is still correct.
865 		 *
866 		 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so
867 		 * as to avoid any possibility of racing against us here, so we
868 		 * only have to consider mlxcx_rx_poll().
869 		 */
870 		atomic_inc_32(&mlcq->mlcq_ec);
871 		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);
872 
873 		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
874 			/*
875 			 * If we failed to take the mutex because the
876 			 * polling function has it, just move on.
877 			 * We don't want to block other CQs behind
878 			 * this one.
879 			 */
880 			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
881 				goto update_eq;
882 
883 			/* Otherwise we will wait. */
884 			mutex_enter(&mlcq->mlcq_mtx);
885 		}
886 
887 		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
888 		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
889 			/*
890 			 * The ring is not in polling mode and we processed
891 			 * some completion queue entries.
892 			 */
893 			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
894 			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
895 				atomic_and_uint(&mlcq->mlcq_state,
896 				    ~MLXCX_CQ_BLOCKED_MAC);
897 				tellmac = B_TRUE;
898 			}
899 
900 			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
901 			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
902 				atomic_and_uint(&mlwq->mlwq_state,
903 				    ~MLXCX_WQ_BLOCKED_MAC);
904 				tellmac = B_TRUE;
905 			}
906 
907 			mlxcx_arm_cq(mlxp, mlcq);
908 
909 			mutex_exit(&mlcq->mlcq_mtx);
910 
911 			if (tellmac) {
912 				mac_tx_ring_update(mlxp->mlx_mac_hdl,
913 				    mlcq->mlcq_mac_hdl);
914 				tellmac = B_FALSE;
915 			}
916 
917 			if (mp != NULL) {
918 				mac_rx_ring(mlxp->mlx_mac_hdl,
919 				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
920 			}
921 		} else {
922 			mutex_exit(&mlcq->mlcq_mtx);
923 		}
924 
925 update_eq:
926 		/*
927 		 * Updating the consumer counter for an EQ requires a write
928 		 * to the UAR, which is possibly expensive.
929 		 *
930 		 * Try to do it only often enough to stop us wrapping around.
931 		 */
932 		if ((mleq->mleq_cc & 0x7) == 0)
933 			mlxcx_update_eq(mlxp, mleq);
934 	}
935 
936 	mlxcx_arm_eq(mlxp, mleq);
937 	mutex_exit(&mleq->mleq_mtx);
938 
939 	return (DDI_INTR_CLAIMED);
940 }
941 
942 boolean_t
943 mlxcx_intr_setup(mlxcx_t *mlxp)
944 {
945 	dev_info_t *dip = mlxp->mlx_dip;
946 	int ret;
947 	int nintrs = 0;
948 	int navail = 0;
949 	int types, i;
950 	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
951 
952 	ret = ddi_intr_get_supported_types(dip, &types);
953 	if (ret != DDI_SUCCESS) {
954 		return (B_FALSE);
955 	}
956 
957 	if (!(types & DDI_INTR_TYPE_MSIX)) {
958 		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
959 		    "requires MSI-X");
960 		return (B_FALSE);
961 	}
962 
963 	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
964 	if (ret != DDI_SUCCESS) {
965 		return (B_FALSE);
966 	}
967 	if (nintrs < 2) {
968 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
969 		    "requires 2", nintrs);
970 		return (B_FALSE);
971 	}
972 
973 	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
974 	if (ret != DDI_SUCCESS || navail < 2) {
975 		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
976 		    "requires 2", navail);
977 		return (B_FALSE);
978 	}
979 
980 	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
981 	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
982 
983 	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
984 	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
985 	if (ret != DDI_SUCCESS) {
986 		mlxcx_intr_teardown(mlxp);
987 		return (B_FALSE);
988 	}
989 	if (mlxp->mlx_intr_count < 2) {
990 		mlxcx_intr_teardown(mlxp);
991 		return (B_FALSE);
992 	}
993 	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;
994 
995 	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
996 	if (ret != DDI_SUCCESS) {
997 		mlxcx_intr_teardown(mlxp);
998 		return (B_FALSE);
999 	}
1000 
1001 	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
1002 	    sizeof (mlxcx_event_queue_t);
1003 	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
1004 
1005 	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0,
1006 	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
1007 	if (ret != DDI_SUCCESS) {
1008 		mlxcx_intr_teardown(mlxp);
1009 		return (B_FALSE);
1010 	}
1011 
1012 	/*
1013 	 * If we have enough interrupts, set their "type" fields so that we
1014 	 * avoid mixing RX and TX queues on the same EQs.
1015 	 */
1016 	if (mlxp->mlx_intr_count >= 8) {
1017 		eqt = MLXCX_EQ_TYPE_RX;
1018 	}
1019 
1020 	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
1021 		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
1022 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1023 		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
1024 		    sizeof (mlxcx_completion_queue_t),
1025 		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
1026 		mlxp->mlx_eqs[i].mleq_intr_index = i;
1027 
1028 		mlxp->mlx_eqs[i].mleq_type = eqt;
1029 		/*
1030 		 * If eqt is still ANY, just leave it set to that
1031 		 * (no else here).
1032 		 */
1033 		if (eqt == MLXCX_EQ_TYPE_RX) {
1034 			eqt = MLXCX_EQ_TYPE_TX;
1035 		} else if (eqt == MLXCX_EQ_TYPE_TX) {
1036 			eqt = MLXCX_EQ_TYPE_RX;
1037 		}
1038 
1039 		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
1040 		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
1041 		if (ret != DDI_SUCCESS) {
1042 			mlxcx_intr_teardown(mlxp);
1043 			return (B_FALSE);
1044 		}
1045 	}
1046 
1047 	return (B_TRUE);
1048 }
1049