/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2020, the University of Queensland
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

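/*
 * Disable and remove all interrupt handlers, free the interrupt handles,
 * and destroy the per-interrupt event queue state. By the time this is
 * called, every EQ must either never have been created or have already
 * been destroyed.
 */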
void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i != 0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

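/*
 * Arm the EQ by writing its number and current consumer counter to the
 * EQ_ARM doorbell in the UAR. On an FM access error, the write is retried
 * up to mlxcx_doorbell_tries times before we declare service lost.
 */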
void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

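/*
 * Update the device's view of our consumer counter without re-arming the
 * EQ, via the NOARM doorbell. This tells the device which entries we have
 * consumed without requesting another interrupt.
 */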
static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm
	 * the EQ, we will note the impact then.
	 */
}

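/*
 * Get the next SW-owned entry on the completion queue, or NULL if we
 * reach the end.
 */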
static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

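/*
 * Arm the CQ: record the armed event and consumer counters, write the
 * consumer counter and arm value into the CQ's doorbell in memory, then
 * ring the CQ_ARM doorbell in the UAR. Both the DMA sync of the doorbell
 * and the UAR write are retried on FM errors before we declare service
 * lost.
 */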
void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED)
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	mlcq->mlcq_state |= MLXCX_CQ_ARMED;
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/* Should be called only when link state has changed. */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}
	mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

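/*
 * Allocate a batch of up to MLXCX_MANAGE_PAGES_MAX_PAGES 4k pages and give
 * them to the hardware in a single MANAGE_PAGES command. If any allocation
 * fails, free what we have so far and tell the hardware about the failure
 * instead.
 */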
static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	int32_t togive;
	mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
	uint_t i;
	const ddi_dma_cookie_t *ck;

	togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	for (i = 0; i < togive; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    togive);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", togive);
		mutex_exit(&mlxp->mlx_pagemtx);
		goto cleanup_npages;
	}

	for (i = 0; i < togive; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += togive;
	mutex_exit(&mlxp->mlx_pagemtx);

	return;

cleanup_npages:
	/* Free only the pages we successfully allocated above. */
	while (i > 0) {
		mdp = pages[--i];
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/* Tell the hardware we had an allocation failure. */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
}

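/*
 * Ask the hardware to return up to MLXCX_MANAGE_PAGES_MAX_PAGES of the
 * pages we have given it, and free our local state for each page it
 * actually hands back.
 */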
static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t req, ret;
	uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
	mlxcx_dev_page_t *mdp, probe;

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
	req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
		mutex_exit(&mlxp->mlx_pagemtx);
		return;
	}

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);
}

static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
	switch (err) {
	case MLXCX_MODULE_ERR_POWER_BUDGET:
		return ("POWER_BUDGET");
	case MLXCX_MODULE_ERR_LONG_RANGE:
		return ("LONG_RANGE");
	case MLXCX_MODULE_ERR_BUS_STUCK:
		return ("BUS_STUCK");
	case MLXCX_MODULE_ERR_NO_EEPROM:
		return ("NO_EEPROM");
	case MLXCX_MODULE_ERR_ENFORCEMENT:
		return ("ENFORCEMENT");
	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		return ("UNKNOWN_IDENT");
	case MLXCX_MODULE_ERR_HIGH_TEMP:
		return ("HIGH_TEMP");
	case MLXCX_MODULE_ERR_CABLE_SHORTED:
		return ("CABLE_SHORTED");
	default:
		return ("UNKNOWN");
	}
}

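/*
 * Post an FM ereport for a transceiver module error event, mapping the
 * module error type onto the generic NIC transceiver error classes.
 */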
static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *lename;
	const char *ename;
	const char *stname;
	uint_t eno = 0;
	mlxcx_module_status_t state = evd->mled_port_mod_module_status;

	switch (state) {
	case MLXCX_MODULE_ERROR:
		stname = "error";
		eno = evd->mled_port_mod_error_type;
		lename = mlxcx_module_error_string(eno);
		switch (eno) {
		case MLXCX_MODULE_ERR_ENFORCEMENT:
			ename = DDI_FM_TXR_ERROR_WHITELIST;
			break;
		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		case MLXCX_MODULE_ERR_NO_EEPROM:
			ename = DDI_FM_TXR_ERROR_NOTSUPP;
			break;
		case MLXCX_MODULE_ERR_HIGH_TEMP:
			ename = DDI_FM_TXR_ERROR_OVERTEMP;
			break;
		case MLXCX_MODULE_ERR_POWER_BUDGET:
		case MLXCX_MODULE_ERR_LONG_RANGE:
		case MLXCX_MODULE_ERR_CABLE_SHORTED:
			ename = DDI_FM_TXR_ERROR_HWFAIL;
			break;
		case MLXCX_MODULE_ERR_BUS_STUCK:
		default:
			ename = DDI_FM_TXR_ERROR_UNKNOWN;
		}
		break;
	default:
		return;
	}

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    /* compulsory FM props */
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    /* generic NIC txr error event props */
	    "error", DATA_TYPE_STRING, ename,
	    "port_index", DATA_TYPE_UINT8, 0,
	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
	    /* local props */
	    "mlxcx_state", DATA_TYPE_STRING, stname,
	    "mlxcx_error", DATA_TYPE_STRING, lename,
	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

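/*
 * Interrupt handler for the first (vector 0) event queue, which carries
 * the device's global async events: page requests, port state changes and
 * module errors. Page requests are tallied up across the batch of events
 * and serviced once at the end.
 */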
static uint_t
mlxcx_intr_0(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_port_t *port;
	uint_t portn;
	int32_t npages = 0;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mlxcx_warn(mlxp, "int0 on bad eq state");
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		mlxcx_warn(mlxp, "spurious int 0?");
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_PAGE_REQUEST:
			VERIFY3U(from_be16(ent->mleqe_page_request.
			    mled_page_request_function_id), ==, 0);
			npages += (int32_t)from_be32(ent->mleqe_page_request.
			    mled_page_request_num_pages);
			break;
		case MLXCX_EVENT_PORT_STATE:
			portn = get_bits8(
			    ent->mleqe_port_state.mled_port_state_port_num,
			    MLXCX_EVENT_PORT_NUM) - 1;
			if (portn >= mlxp->mlx_nports)
				break;
			port = &mlxp->mlx_ports[portn];
			mlxcx_update_link_state(mlxp, port);
			break;
		case MLXCX_EVENT_PORT_MODULE:
			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
			break;
		default:
			mlxcx_warn(mlxp, "unhandled event 0x%x on int0",
			    ent->mleqe_event_type);
		}
	}

	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

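/*
 * Poll an RX completion queue for received packets, returning a chain of
 * mblks. Called with the CQ mutex held while the ring is in polling mode
 * (MLXCX_CQ_POLLING); stops early once more than bytelim bytes have been
 * collected, if bytelim is non-zero.
 */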
mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
	mlxcx_buffer_t *buf;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_completionq_ent_t *cent;
	size_t bytes = 0;
	boolean_t found;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	ASSERT(mlcq->mlcq_wq != NULL);
	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (NULL);
	}

	ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING);

	nmp = cmp = mp = NULL;

	cent = mlxcx_cq_next(mlcq);
	for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			break;

		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			goto nextcq;
		}

		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}
		if (!found) {
			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX : buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}
		list_remove(&mlcq->mlcq_buffers, buf);
		atomic_dec_64(&mlcq->mlcq_bufcnt);

		nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
		if (nmp != NULL) {
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (cmp != NULL) {
				cmp->b_next = nmp;
				cmp = nmp;
			} else {
				mp = cmp = nmp;
			}
		}
nextcq:
		mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

		if (bytelim != 0 && bytes > bytelim)
			break;
	}

	return (mp);
}

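/*
 * Interrupt handler for the remaining event queues, which only ever carry
 * completion events. For each completion event we look up the CQ, drain
 * it, and pass anything received up to MAC.
 */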
static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_completionq_ent_t *cent;
	mlxcx_completion_queue_t *mlcq, probe;
	mlxcx_buffer_t *buf;
	mblk_t *mp, *cmp, *nmp;
	boolean_t found, tellmac = B_FALSE, added;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
		}
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}
	mleq->mleq_badintrs = 0;

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
			mutex_exit(&mleq->mleq_mtx);
			return (DDI_INTR_CLAIMED);
		}
		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

		probe.mlcq_num =
		    from_be24(ent->mleqe_completion.mled_completion_cqn);
		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);

		if (mlcq == NULL)
			continue;

		/*
		 * The polling function might have the mutex and stop us from
		 * getting the lock here, so we increment the event counter
		 * atomically from outside.
		 *
		 * This way at the end of polling when we go back to interrupts
		 * from this CQ, the event counter is still correct.
		 *
		 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so
		 * as to avoid any possibility of racing against us here, so we
		 * only have to consider mlxcx_rx_poll().
		 */
		atomic_inc_32(&mlcq->mlcq_ec);
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the polling
			 * function has it, just move on. We don't want to
			 * block other CQs behind this one.
			 */
			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
				continue;
			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
		    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
		    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
		    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) ||
		    (mlcq->mlcq_state & MLXCX_CQ_POLLING)) {
			mutex_exit(&mlcq->mlcq_mtx);
			continue;
		}

		nmp = cmp = mp = NULL;
		tellmac = B_FALSE;

		cent = mlxcx_cq_next(mlcq);
		for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
			/*
			 * Teardown and ring stop can atomic_or this flag
			 * into our state if they want us to stop early.
			 */
			if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
				break;
			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
				break;

			if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
			    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
				/* NOP */
				goto nextcq;
			}

lookagain:
			/*
			 * Generally the buffer we're looking for will be
			 * at the front of the list, so this loop won't
			 * need to look far.
			 */
			buf = list_head(&mlcq->mlcq_buffers);
			found = B_FALSE;
			while (buf != NULL) {
				if ((buf->mlb_wqe_index & UINT16_MAX) ==
				    from_be16(cent->mlcqe_wqe_counter)) {
					found = B_TRUE;
					break;
				}
				buf = list_next(&mlcq->mlcq_buffers, buf);
			}
			if (!found) {
				/*
				 * If there's any buffers waiting on the
				 * buffers_b list, then merge those into
				 * the main list and have another look.
				 *
				 * The wq enqueue routines push new buffers
				 * into buffers_b so that they can avoid
				 * taking the mlcq_mtx and blocking us for
				 * every single packet.
				 */
				added = B_FALSE;
				mutex_enter(&mlcq->mlcq_bufbmtx);
				if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
					list_move_tail(&mlcq->mlcq_buffers,
					    &mlcq->mlcq_buffers_b);
					added = B_TRUE;
				}
				mutex_exit(&mlcq->mlcq_bufbmtx);
				if (added)
					goto lookagain;
			}
			if (!found) {
				buf = list_head(&mlcq->mlcq_buffers);
				mlxcx_warn(mlxp, "got completion on CQ %x but "
				    "no buffer matching wqe found: %x (first "
				    "buffer counter = %x)", mlcq->mlcq_num,
				    from_be16(cent->mlcqe_wqe_counter),
				    buf == NULL ? UINT32_MAX :
				    buf->mlb_wqe_index);
				mlxcx_fm_ereport(mlxp,
				    DDI_FM_DEVICE_INVAL_STATE);
				goto nextcq;
			}
			list_remove(&mlcq->mlcq_buffers, buf);
			atomic_dec_64(&mlcq->mlcq_bufcnt);

			switch (mlcq->mlcq_wq->mlwq_type) {
			case MLXCX_WQ_TYPE_SENDQ:
				mlxcx_tx_completion(mlxp, mlcq, cent, buf);
				break;
			case MLXCX_WQ_TYPE_RECVQ:
				nmp = mlxcx_rx_completion(mlxp, mlcq, cent,
				    buf);
				if (nmp != NULL) {
					if (cmp != NULL) {
						cmp->b_next = nmp;
						cmp = nmp;
					} else {
						mp = cmp = nmp;
					}
				}
				break;
			}

nextcq:
			/*
			 * Update the "doorbell" consumer counter for the queue
			 * every time. Unlike a UAR write, this is relatively
			 * cheap and doesn't require us to go out on the bus
			 * straight away (since it's our memory).
			 */
			mlcq->mlcq_doorbell->mlcqd_update_ci =
			    to_be24(mlcq->mlcq_cc);

			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC;
				tellmac = B_TRUE;
			}
		}

		mlxcx_arm_cq(mlxp, mlcq);
		mutex_exit(&mlcq->mlcq_mtx);

		if (tellmac) {
			mac_tx_ring_update(mlxp->mlx_mac_hdl,
			    mlcq->mlcq_mac_hdl);
		}
		if (mp != NULL) {
			mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl,
			    mp, mlcq->mlcq_mac_gen);
		}

		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
		 */
		if ((mleq->mleq_cc & 0x7) == 0)
			mlxcx_update_eq(mlxp, mleq);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

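/*
 * Allocate MSI-X interrupts (we require at least two: one for the async
 * event queue and one or more for completions), set up the event queue
 * state for each vector, and attach the interrupt handlers.
 */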
boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
	dev_info_t *dip = mlxp->mlx_dip;
	int ret;
	int nintrs = 0;
	int navail = 0;
	int types, i;
	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;

	ret = ddi_intr_get_supported_types(dip, &types);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}

	if (!(types & DDI_INTR_TYPE_MSIX)) {
		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
		    "requires MSI-X");
		return (B_FALSE);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}
	if (nintrs < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", nintrs);
		return (B_FALSE);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}
	if (navail < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", navail);
		return (B_FALSE);
	}

	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);

	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	if (mlxp->mlx_intr_count < 2) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
	    sizeof (mlxcx_event_queue_t);
	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0,
	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * If we have enough interrupts, set their "type" fields so that we
	 * avoid mixing RX and TX queues on the same EQs.
	 */
	if (mlxp->mlx_intr_count >= 8) {
		eqt = MLXCX_EQ_TYPE_RX;
	}

	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
		    sizeof (mlxcx_completion_queue_t),
		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
		mlxp->mlx_eqs[i].mleq_intr_index = i;

		mlxp->mlx_eqs[i].mleq_type = eqt;
		/*
		 * If eqt is still ANY, just leave it set to that
		 * (no else here).
		 */
		if (eqt == MLXCX_EQ_TYPE_RX) {
			eqt = MLXCX_EQ_TYPE_TX;
		} else if (eqt == MLXCX_EQ_TYPE_TX) {
			eqt = MLXCX_EQ_TYPE_RX;
		}

		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}