xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_gld.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2021, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  * Copyright 2023 MNX Cloud, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/sysmacros.h>
26 #include <sys/vlan.h>
27 
28 #include <sys/pattr.h>
29 #include <sys/dlpi.h>
30 
31 #include <sys/mac_provider.h>
32 
33 /* Need these for mac_vlan_header_info() */
34 #include <sys/mac_client.h>
35 #include <sys/mac_client_priv.h>
36 
37 #include <mlxcx.h>
38 
/*
 * NULL-terminated list of driver-private property names. It currently has
 * no entries. (Presumably handed to MAC at registration time; the consumer
 * is not visible in this part of the file.)
 */
static char *mlxcx_priv_props[] = { NULL };

/* Multipliers used to express link speeds in bits per second. */
#define	MBITS		(1000ULL * 1000ULL)
#define	GBITS		(1000ULL * MBITS)
45 
46 static uint64_t
47 mlxcx_speed_to_bits(mlxcx_eth_proto_t proto, mlxcx_ext_eth_proto_t ext_proto)
48 {
49 	/*
50 	 * Older parts only used "proto", but starting with ConnectX-6, there
51 	 * might be speeds & link-types in an extended set of proto bits.
52 	 *
53 	 * We check the old bits first because the extended bits do not report
54 	 * media on links (e.g. nothing like MLXCX_EXTPROTO_100GBASE_CR2
55 	 * for a 50Gbit lane).
56 	 *
57 	 * In the case of, e.g., 100GBASE_CR4 both proto and ext_proto have
58 	 * bits set, but the extended proto bits are a generic CAUI4 indicator
59 	 * that could be for CR4, KR4, etc. If we get a legitimate single-bit
60 	 * value, we don't worry about ext_proto. This may change in the face
61 	 * of other HW or cabling, however.
62 	 */
63 	switch (proto) {
64 	case MLXCX_PROTO_NONE:	/* Aka "0" */
65 		/* Go straight to checking ext_proto. */
66 		break;
67 	case MLXCX_PROTO_SGMII_100BASE:
68 	case MLXCX_PROTO_100BASE_TX:
69 		return (100ULL * MBITS);
70 	case MLXCX_PROTO_SGMII:
71 	case MLXCX_PROTO_1000BASE_KX:
72 	case MLXCX_PROTO_1000BASE_T:
73 		return (1000ULL * MBITS);
74 	case MLXCX_PROTO_10GBASE_CX4:
75 	case MLXCX_PROTO_10GBASE_KX4:
76 	case MLXCX_PROTO_10GBASE_KR:
77 	case MLXCX_PROTO_10GBASE_CR:
78 	case MLXCX_PROTO_10GBASE_SR:
79 	case MLXCX_PROTO_10GBASE_ER_LR:
80 	case MLXCX_PROTO_10GBASE_T:
81 		return (10ULL * GBITS);
82 	case MLXCX_PROTO_40GBASE_CR4:
83 	case MLXCX_PROTO_40GBASE_KR4:
84 	case MLXCX_PROTO_40GBASE_SR4:
85 	case MLXCX_PROTO_40GBASE_LR4_ER4:
86 		return (40ULL * GBITS);
87 	case MLXCX_PROTO_25GBASE_CR:
88 	case MLXCX_PROTO_25GBASE_KR:
89 	case MLXCX_PROTO_25GBASE_SR:
90 		return (25ULL * GBITS);
91 	case MLXCX_PROTO_50GBASE_SR2:
92 	case MLXCX_PROTO_50GBASE_CR2:
93 	case MLXCX_PROTO_50GBASE_KR2:
94 		return (50ULL * GBITS);
95 	case MLXCX_PROTO_100GBASE_CR4:
96 	case MLXCX_PROTO_100GBASE_SR4:
97 	case MLXCX_PROTO_100GBASE_KR4:
98 	case MLXCX_PROTO_100GBASE_LR4_ER4:
99 		return (100ULL * GBITS);
100 	default:
101 		/*
102 		 * We've checked for 0 explicitly above, so don't worry here.
103 		 *
104 		 * There ARE legitimate single-bit values we don't support,
105 		 * and should just return 0 immediately.  We will ASSERT()
106 		 * that it's a single-bit value, however, since the passed-in
107 		 * values are from the "operational" register, which is only
108 		 * supposed to have one bit set. If the assertion fails
109 		 * there's either a hardware error or a severe
110 		 * misunderstanding of the register.
111 		 */
112 		ASSERT0((uint32_t)proto & ((uint32_t)proto - 1U));
113 		return (0);
114 	}
115 
116 	switch (ext_proto) {
117 	case MLXCX_EXTPROTO_SGMII_100BASE:
118 		return (100ULL * MBITS);
119 	case MLXCX_EXTPROTO_1000BASE_X_SGMII:
120 		return (1000ULL * MBITS);
121 	case MLXCX_EXTPROTO_5GBASE_R:
122 		return (5ULL * GBITS);
123 	case MLXCX_EXTPROTO_10GBASE_XFI_XAUI_1:
124 		return (10ULL * GBITS);
125 	case MLXCX_EXTPROTO_40GBASE_XLAUI_4_XLPPI_4:
126 		return (40ULL * GBITS);
127 	case MLXCX_EXTPROTO_25GAUI_1_25GBASE_CR_KR:
128 		return (25ULL * GBITS);
129 	case MLXCX_EXTPROTO_50GAUI_2_LAUI_2_50GBASE_CR2_KR2:
130 	case MLXCX_EXTPROTO_50GAUI_1_LAUI_1_50GBASE_CR_KR:
131 		return (50ULL * GBITS);
132 	case MLXCX_EXTPROTO_CAUI_4_100GBASE_CR4_KR4:
133 	case MLXCX_EXTPROTO_100GAUI_2_100GBASE_CR2_KR2:
134 	case MLXCX_EXTPROTO_100GAUI_1_100GBASE_CR_KR:
135 		return (100ULL * GBITS);
136 	case MLXCX_EXTPROTO_200GAUI_4_200GBASE_CR4_KR4:
137 	case MLXCX_EXTPROTO_200GAUI_2_200GBASE_CR2_KR2:
138 		return (200ULL * GBITS);
139 	case MLXCX_EXTPROTO_400GAUI_8_400GBASE_CR8:
140 	case MLXCX_EXTPROTO_400GAUI_4_400GBASE_CR4:
141 		return (400ULL * GBITS);
142 	default:
143 		/*
144 		 * There ARE legitimate single-bit values we don't support,
145 		 * and should just return 0 immediately.  We will ASSERT()
146 		 * that it's a single-bit value, however, for reasons detailed
147 		 * in the prior `default` case.
148 		 */
149 		ASSERT0((uint32_t)ext_proto & ((uint32_t)ext_proto - 1U));
150 		break;
151 	}
152 
153 	return (0);
154 }
155 
156 static link_fec_t
157 mlxcx_fec_to_link_fec(mlxcx_pplm_fec_active_t mlxcx_fec)
158 {
159 	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_NONE) != 0)
160 		return (LINK_FEC_NONE);
161 
162 	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_FIRECODE) != 0)
163 		return (LINK_FEC_BASE_R);
164 
165 	if ((mlxcx_fec & (MLXCX_PPLM_FEC_ACTIVE_RS528 |
166 	    MLXCX_PPLM_FEC_ACTIVE_RS271 | MLXCX_PPLM_FEC_ACTIVE_RS544 |
167 	    MLXCX_PPLM_FEC_ACTIVE_RS272)) != 0)
168 		return (LINK_FEC_RS);
169 
170 	return (LINK_FEC_NONE);
171 }
172 
173 static boolean_t
174 mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp)
175 {
176 	mlxcx_pplm_fec_caps_t pplm_fec = 0;
177 
178 	if ((fec & LINK_FEC_AUTO) != 0) {
179 		pplm_fec = MLXCX_PPLM_FEC_CAP_AUTO;
180 		fec &= ~LINK_FEC_AUTO;
181 	} else if ((fec & LINK_FEC_NONE) != 0) {
182 		pplm_fec = MLXCX_PPLM_FEC_CAP_NONE;
183 		fec &= ~LINK_FEC_NONE;
184 	} else if ((fec & LINK_FEC_RS) != 0) {
185 		pplm_fec |= MLXCX_PPLM_FEC_CAP_RS;
186 		fec &= ~LINK_FEC_RS;
187 	} else if ((fec & LINK_FEC_BASE_R) != 0) {
188 		pplm_fec |= MLXCX_PPLM_FEC_CAP_FIRECODE;
189 		fec &= ~LINK_FEC_BASE_R;
190 	}
191 
192 	/*
193 	 * Only one fec option is allowed.
194 	 */
195 	if (fec != 0)
196 		return (B_FALSE);
197 
198 	*pfecp = pplm_fec;
199 
200 	return (B_TRUE);
201 }
202 
203 static int
204 mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat,
205     uint64_t *val)
206 {
207 	int ret = 0;
208 	boolean_t ok;
209 	mlxcx_register_data_t data;
210 	mlxcx_ppcnt_rfc_2863_t *st;
211 
212 	ASSERT(mutex_owned(&port->mlp_mtx));
213 
214 	bzero(&data, sizeof (data));
215 	data.mlrd_ppcnt.mlrd_ppcnt_local_port = port->mlp_num + 1;
216 	data.mlrd_ppcnt.mlrd_ppcnt_grp = MLXCX_PPCNT_GRP_RFC_2863;
217 	data.mlrd_ppcnt.mlrd_ppcnt_clear = MLXCX_PPCNT_NO_CLEAR;
218 
219 	ok = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
220 	    MLXCX_REG_PPCNT, &data);
221 	if (!ok)
222 		return (EIO);
223 	st = &data.mlrd_ppcnt.mlrd_ppcnt_rfc_2863;
224 
225 	switch (stat) {
226 	case MAC_STAT_RBYTES:
227 		*val = from_be64(st->mlppc_rfc_2863_in_octets);
228 		break;
229 	case MAC_STAT_MULTIRCV:
230 		*val = from_be64(st->mlppc_rfc_2863_in_mcast_pkts);
231 		break;
232 	case MAC_STAT_BRDCSTRCV:
233 		*val = from_be64(st->mlppc_rfc_2863_in_bcast_pkts);
234 		break;
235 	case MAC_STAT_MULTIXMT:
236 		*val = from_be64(st->mlppc_rfc_2863_out_mcast_pkts);
237 		break;
238 	case MAC_STAT_BRDCSTXMT:
239 		*val = from_be64(st->mlppc_rfc_2863_out_bcast_pkts);
240 		break;
241 	case MAC_STAT_IERRORS:
242 		*val = from_be64(st->mlppc_rfc_2863_in_errors);
243 		break;
244 	case MAC_STAT_UNKNOWNS:
245 		*val = from_be64(st->mlppc_rfc_2863_in_unknown_protos);
246 		break;
247 	case MAC_STAT_OERRORS:
248 		*val = from_be64(st->mlppc_rfc_2863_out_errors);
249 		break;
250 	case MAC_STAT_OBYTES:
251 		*val = from_be64(st->mlppc_rfc_2863_out_octets);
252 		break;
253 	default:
254 		ret = ENOTSUP;
255 	}
256 
257 	return (ret);
258 }
259 
260 static int
261 mlxcx_mac_stat_ieee_802_3(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat,
262     uint64_t *val)
263 {
264 	int ret = 0;
265 	boolean_t ok;
266 	mlxcx_register_data_t data;
267 	mlxcx_ppcnt_ieee_802_3_t *st;
268 
269 	ASSERT(mutex_owned(&port->mlp_mtx));
270 
271 	bzero(&data, sizeof (data));
272 	data.mlrd_ppcnt.mlrd_ppcnt_local_port = port->mlp_num + 1;
273 	data.mlrd_ppcnt.mlrd_ppcnt_grp = MLXCX_PPCNT_GRP_IEEE_802_3;
274 	data.mlrd_ppcnt.mlrd_ppcnt_clear = MLXCX_PPCNT_NO_CLEAR;
275 
276 	ok = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
277 	    MLXCX_REG_PPCNT, &data);
278 	if (!ok)
279 		return (EIO);
280 	st = &data.mlrd_ppcnt.mlrd_ppcnt_ieee_802_3;
281 
282 	switch (stat) {
283 	case MAC_STAT_IPACKETS:
284 		*val = from_be64(st->mlppc_ieee_802_3_frames_rx);
285 		break;
286 	case MAC_STAT_OPACKETS:
287 		*val = from_be64(st->mlppc_ieee_802_3_frames_tx);
288 		break;
289 	case ETHER_STAT_ALIGN_ERRORS:
290 		*val = from_be64(st->mlppc_ieee_802_3_align_err);
291 		break;
292 	case ETHER_STAT_FCS_ERRORS:
293 		*val = from_be64(st->mlppc_ieee_802_3_fcs_err);
294 		break;
295 	case ETHER_STAT_TOOLONG_ERRORS:
296 		*val = from_be64(st->mlppc_ieee_802_3_frame_too_long_err);
297 		break;
298 	default:
299 		ret = ENOTSUP;
300 	}
301 
302 	return (ret);
303 }
304 
305 static int
306 mlxcx_mac_stat(void *arg, uint_t stat, uint64_t *val)
307 {
308 	mlxcx_t *mlxp = (mlxcx_t *)arg;
309 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
310 	int ret = 0;
311 
312 	mutex_enter(&port->mlp_mtx);
313 
314 	switch (stat) {
315 	case MAC_STAT_IFSPEED:
316 		*val = mlxcx_speed_to_bits(port->mlp_oper_proto,
317 		    port->mlp_ext_oper_proto);
318 		break;
319 	case ETHER_STAT_LINK_DUPLEX:
320 		*val = LINK_DUPLEX_FULL;
321 		break;
322 	case MAC_STAT_RBYTES:
323 	case MAC_STAT_MULTIRCV:
324 	case MAC_STAT_BRDCSTRCV:
325 	case MAC_STAT_MULTIXMT:
326 	case MAC_STAT_BRDCSTXMT:
327 	case MAC_STAT_IERRORS:
328 	case MAC_STAT_UNKNOWNS:
329 	case MAC_STAT_OERRORS:
330 	case MAC_STAT_OBYTES:
331 		ret = mlxcx_mac_stat_rfc_2863(mlxp, port, stat, val);
332 		break;
333 	case MAC_STAT_IPACKETS:
334 	case MAC_STAT_OPACKETS:
335 	case ETHER_STAT_ALIGN_ERRORS:
336 	case ETHER_STAT_FCS_ERRORS:
337 	case ETHER_STAT_TOOLONG_ERRORS:
338 		ret = mlxcx_mac_stat_ieee_802_3(mlxp, port, stat, val);
339 		break;
340 	case MAC_STAT_NORCVBUF:
341 		*val = port->mlp_stats.mlps_rx_drops;
342 		break;
343 	default:
344 		ret = ENOTSUP;
345 	}
346 
347 	mutex_exit(&port->mlp_mtx);
348 
349 	return (ret);
350 }
351 
352 static int
353 mlxcx_mac_led_set(void *arg, mac_led_mode_t mode, uint_t flags)
354 {
355 	mlxcx_t *mlxp = arg;
356 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
357 	int ret = 0;
358 
359 	if (flags != 0) {
360 		return (EINVAL);
361 	}
362 
363 	mutex_enter(&port->mlp_mtx);
364 
365 	switch (mode) {
366 	case MAC_LED_DEFAULT:
367 	case MAC_LED_OFF:
368 		if (!mlxcx_cmd_set_port_led(mlxp, port, 0)) {
369 			ret = EIO;
370 			break;
371 		}
372 		break;
373 	case MAC_LED_IDENT:
374 		if (!mlxcx_cmd_set_port_led(mlxp, port, UINT16_MAX)) {
375 			ret = EIO;
376 			break;
377 		}
378 		break;
379 	default:
380 		ret = ENOTSUP;
381 	}
382 
383 	mutex_exit(&port->mlp_mtx);
384 
385 	return (ret);
386 }
387 
388 static int
389 mlxcx_mac_txr_info(void *arg, uint_t id, mac_transceiver_info_t *infop)
390 {
391 	mlxcx_t *mlxp = arg;
392 	mlxcx_module_status_t st;
393 
394 	if (!mlxcx_cmd_query_module_status(mlxp, id, &st, NULL))
395 		return (EIO);
396 
397 	if (st != MLXCX_MODULE_UNPLUGGED)
398 		mac_transceiver_info_set_present(infop, B_TRUE);
399 
400 	if (st == MLXCX_MODULE_PLUGGED)
401 		mac_transceiver_info_set_usable(infop, B_TRUE);
402 
403 	return (0);
404 }
405 
406 static int
407 mlxcx_mac_txr_read(void *arg, uint_t id, uint_t page, void *vbuf,
408     size_t nbytes, off_t offset, size_t *nread)
409 {
410 	mlxcx_t *mlxp = arg;
411 	mlxcx_register_data_t data;
412 	uint8_t *buf = vbuf;
413 	boolean_t ok;
414 	size_t take, done = 0;
415 	uint8_t i2c_addr;
416 
417 	if (id != 0 || vbuf == NULL || nbytes == 0 || nread == NULL)
418 		return (EINVAL);
419 
420 	if (nbytes > 256 || offset >= 256 || (offset + nbytes > 256))
421 		return (EINVAL);
422 
423 	/*
424 	 * The PRM is really not very clear about any of this, but it seems
425 	 * that the i2c_device_addr field in MCIA is the SFP+ spec "page"
426 	 * number shifted right by 1 bit. They're written in the SFF spec
427 	 * like "1010000X" so Mellanox just dropped the X.
428 	 *
429 	 * This means that if we want page 0xA0, we put 0x50 in the
430 	 * i2c_device_addr field.
431 	 *
432 	 * The "page_number" field in MCIA means something else. Don't ask me
433 	 * what. FreeBSD leaves it as zero, so we will too!
434 	 */
435 	i2c_addr = page >> 1;
436 
437 	while (done < nbytes) {
438 		take = nbytes - done;
439 		if (take > sizeof (data.mlrd_mcia.mlrd_mcia_data))
440 			take = sizeof (data.mlrd_mcia.mlrd_mcia_data);
441 
442 		bzero(&data, sizeof (data));
443 		ASSERT3U(id, <=, 0xff);
444 		data.mlrd_mcia.mlrd_mcia_module = (uint8_t)id;
445 		data.mlrd_mcia.mlrd_mcia_i2c_device_addr = i2c_addr;
446 		data.mlrd_mcia.mlrd_mcia_device_addr = to_be16(offset);
447 		data.mlrd_mcia.mlrd_mcia_size = to_be16(take);
448 
449 		ok = mlxcx_cmd_access_register(mlxp,
450 		    MLXCX_CMD_ACCESS_REGISTER_READ, MLXCX_REG_MCIA, &data);
451 		if (!ok) {
452 			*nread = 0;
453 			return (EIO);
454 		}
455 
456 		if (data.mlrd_mcia.mlrd_mcia_status != MLXCX_MCIA_STATUS_OK) {
457 			*nread = 0;
458 			return (EIO);
459 		}
460 
461 		bcopy(data.mlrd_mcia.mlrd_mcia_data, &buf[done], take);
462 
463 		done += take;
464 		offset += take;
465 	}
466 	*nread = done;
467 	return (0);
468 }
469 
470 static int
471 mlxcx_mac_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
472 {
473 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
474 	(void) wq;
475 
476 	/*
477 	 * We should add support for using hw flow counters and such to
478 	 * get per-ring statistics. Not done yet though!
479 	 */
480 
481 	switch (stat) {
482 	default:
483 		*val = 0;
484 		return (ENOTSUP);
485 	}
486 
487 	return (0);
488 }
489 
/*
 * Device start entry point. There is nothing to do here, so this always
 * succeeds.
 */
static int
mlxcx_mac_start(void *arg)
{
	(void) arg;

	return (0);
}
497 
/*
 * Device stop entry point. Intentionally a no-op.
 */
static void
mlxcx_mac_stop(void *arg)
{
	(void) arg;
}
504 
/*
 * Transmit entry point for a send queue (installed as mri_tx by
 * mlxcx_mac_fill_tx_ring()).
 *
 * arg is the send work queue; mp is a single message (b_next must be NULL).
 *
 * The L2 header is copied out of the message so it can be passed inline to
 * mlxcx_sq_add_buffer(); the remainder of the message is bound or copied
 * into a chain of driver buffers.
 *
 * Returns NULL when the message has been taken (queued for transmit, or
 * dropped because it had no valid L2 header or the queue is being torn
 * down / not started). Returns mp when the queue is blocked; in that case
 * the relevant *_BLOCKED_MAC state flag has been set first.
 */
static mblk_t *
mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
{
	mlxcx_work_queue_t *sq = (mlxcx_work_queue_t *)arg;
	mlxcx_t *mlxp = sq->mlwq_mlx;
	mlxcx_completion_queue_t *cq;
	mlxcx_buffer_t *b;
	mac_header_info_t mhi;
	mblk_t *kmp, *nmp;
	uint8_t inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN];
	size_t inline_hdrlen, rem, off;
	uint32_t chkflags = 0;
	boolean_t ok;
	size_t take = 0;
	uint_t bcount;

	VERIFY(mp->b_next == NULL);

	/* Only the checksum offload flags are of interest here. */
	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &chkflags);

	if (mac_vlan_header_info(mlxp->mlx_mac_hdl, mp, &mhi) != 0) {
		/*
		 * We got given a frame without a valid L2 header on it. We
		 * can't really transmit that (mlx parts don't like it), so
		 * we will just drop it on the floor.
		 */
		freemsg(mp);
		return (NULL);
	}

	inline_hdrlen = rem = mhi.mhi_hdrsize;

	/*
	 * Copy the first mhi_hdrsize bytes of the message into inline_hdrs,
	 * following b_cont as needed. When the loop finishes, kmp is the
	 * mblk holding the first byte after the headers and take is the
	 * offset of that byte within kmp (0 if the headers ended exactly at
	 * the end of an mblk, in which case kmp has already been advanced).
	 */
	kmp = mp;
	off = 0;
	while (rem > 0) {
		const ptrdiff_t sz = MBLKL(kmp);
		ASSERT3S(sz, >=, 0);
		ASSERT3U(sz, <=, SIZE_MAX);
		take = sz;
		if (take > rem)
			take = rem;
		bcopy(kmp->b_rptr, inline_hdrs + off, take);
		rem -= take;
		off += take;
		if (take == sz) {
			take = 0;
			kmp = kmp->b_cont;
		}
	}

	/*
	 * Bind or copy everything from (kmp, take) onwards into the buffer
	 * chain b. A count of zero is treated as "no buffers available":
	 * flag the queue as blocked and hand the untouched message back.
	 */
	bcount = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b);
	if (bcount == 0) {
		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
		return (mp);
	}

	mutex_enter(&sq->mlwq_mtx);
	VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2);
	cq = sq->mlwq_cq;

	/*
	 * state is a single int, so read-only access without the CQ lock
	 * should be fine.
	 */
	/*
	 * NOTE(review): the two drop paths below return the chain with
	 * B_FALSE while the "blocked" path uses B_TRUE. Presumably that
	 * argument controls whether the attached mblk is kept for the
	 * caller -- confirm against mlxcx_buf_return_chain().
	 */
	if (cq->mlcq_state & MLXCX_CQ_TEARDOWN) {
		mutex_exit(&sq->mlwq_mtx);
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
		return (NULL);
	}

	/* The queue must be started and must not be tearing down. */
	if ((sq->mlwq_state & (MLXCX_WQ_TEARDOWN | MLXCX_WQ_STARTED)) !=
	    MLXCX_WQ_STARTED) {
		mutex_exit(&sq->mlwq_mtx);
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
		return (NULL);
	}

	/*
	 * If the completion queue buffer count is already at or above
	 * the high water mark, or the addition of this new chain will
	 * exceed the CQ ring size, then indicate we are blocked.
	 */
	if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm ||
	    (cq->mlcq_bufcnt + bcount) > cq->mlcq_nents) {
		atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
		goto blocked;
	}

	/* Likewise if the send queue itself is at its high water mark. */
	if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
		goto blocked;
	}

	ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
	    chkflags, b);
	if (!ok) {
		/* Mark both queues blocked when the enqueue fails. */
		atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
		goto blocked;
	}

	/*
	 * Now that we've successfully enqueued the rest of the packet,
	 * free any mblks that we cut off while inlining headers.
	 */
	for (; mp != kmp; mp = nmp) {
		nmp = mp->b_cont;
		freeb(mp);
	}

	mutex_exit(&sq->mlwq_mtx);

	return (NULL);

blocked:
	/* Give the buffers back and return the original message. */
	mutex_exit(&sq->mlwq_mtx);
	mlxcx_buf_return_chain(mlxp, b, B_TRUE);
	return (mp);
}
624 
625 static int
626 mlxcx_mac_setpromisc(void *arg, boolean_t on)
627 {
628 	mlxcx_t *mlxp = (mlxcx_t *)arg;
629 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
630 	mlxcx_flow_group_t *fg;
631 	mlxcx_flow_entry_t *fe;
632 	mlxcx_flow_table_t *ft;
633 	mlxcx_ring_group_t *g;
634 	int ret = 0;
635 	uint_t idx;
636 
637 	mutex_enter(&port->mlp_mtx);
638 
639 	/*
640 	 * First, do the top-level flow entry on the root flow table for
641 	 * the port. This catches all traffic that doesn't match any MAC
642 	 * MAC filters.
643 	 */
644 	ft = port->mlp_rx_flow;
645 	mutex_enter(&ft->mlft_mtx);
646 	fg = port->mlp_promisc;
647 	fe = list_head(&fg->mlfg_entries);
648 	if (on && !(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
649 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
650 			ret = EIO;
651 		}
652 	} else if (!on && (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
653 		if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
654 			ret = EIO;
655 		}
656 	}
657 	mutex_exit(&ft->mlft_mtx);
658 
659 	/*
660 	 * If we failed to change the top-level entry, don't bother with
661 	 * trying the per-group ones.
662 	 */
663 	if (ret != 0) {
664 		mutex_exit(&port->mlp_mtx);
665 		return (ret);
666 	}
667 
668 	/*
669 	 * Then, do the per-rx-group flow entries which catch traffic that
670 	 * matched a MAC filter but failed to match a VLAN filter.
671 	 */
672 	for (idx = 0; idx < mlxp->mlx_rx_ngroups; ++idx) {
673 		g = &mlxp->mlx_rx_groups[idx];
674 
675 		mutex_enter(&g->mlg_mtx);
676 
677 		ft = g->mlg_rx_vlan_ft;
678 		mutex_enter(&ft->mlft_mtx);
679 
680 		fg = g->mlg_rx_vlan_promisc_fg;
681 		fe = list_head(&fg->mlfg_entries);
682 		if (on && !(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
683 			if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
684 				ret = EIO;
685 			}
686 		} else if (!on && (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
687 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
688 				ret = EIO;
689 			}
690 		}
691 
692 		mutex_exit(&ft->mlft_mtx);
693 		mutex_exit(&g->mlg_mtx);
694 	}
695 
696 	mutex_exit(&port->mlp_mtx);
697 	return (ret);
698 }
699 
700 static int
701 mlxcx_mac_multicast(void *arg, boolean_t add, const uint8_t *addr)
702 {
703 	mlxcx_t *mlxp = (mlxcx_t *)arg;
704 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
705 	mlxcx_ring_group_t *g = &mlxp->mlx_rx_groups[0];
706 	int ret = 0;
707 
708 	mutex_enter(&port->mlp_mtx);
709 	mutex_enter(&g->mlg_mtx);
710 	if (add) {
711 		if (!mlxcx_add_umcast_entry(mlxp, port, g, addr)) {
712 			ret = EIO;
713 		}
714 	} else {
715 		if (!mlxcx_remove_umcast_entry(mlxp, port, g, addr)) {
716 			ret = EIO;
717 		}
718 	}
719 	mutex_exit(&g->mlg_mtx);
720 	mutex_exit(&port->mlp_mtx);
721 	return (ret);
722 }
723 
724 static int
725 mlxcx_group_add_mac(void *arg, const uint8_t *mac_addr)
726 {
727 	mlxcx_ring_group_t *g = arg;
728 	mlxcx_t *mlxp = g->mlg_mlx;
729 	mlxcx_port_t *port = g->mlg_port;
730 	int ret = 0;
731 
732 	mutex_enter(&port->mlp_mtx);
733 	mutex_enter(&g->mlg_mtx);
734 	if (!mlxcx_add_umcast_entry(mlxp, port, g, mac_addr)) {
735 		ret = EIO;
736 	}
737 	mutex_exit(&g->mlg_mtx);
738 	mutex_exit(&port->mlp_mtx);
739 
740 	return (ret);
741 }
742 
743 static int
744 mlxcx_group_add_vlan(mac_group_driver_t gh, uint16_t vid)
745 {
746 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
747 	mlxcx_t *mlxp = g->mlg_mlx;
748 	int ret = 0;
749 	boolean_t tagged = B_TRUE;
750 
751 	if (vid == MAC_VLAN_UNTAGGED) {
752 		vid = 0;
753 		tagged = B_FALSE;
754 	}
755 
756 	mutex_enter(&g->mlg_mtx);
757 	if (!mlxcx_add_vlan_entry(mlxp, g, tagged, vid)) {
758 		ret = EIO;
759 	}
760 	mutex_exit(&g->mlg_mtx);
761 
762 	return (ret);
763 }
764 
765 static int
766 mlxcx_group_remove_vlan(mac_group_driver_t gh, uint16_t vid)
767 {
768 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
769 	mlxcx_t *mlxp = g->mlg_mlx;
770 	int ret = 0;
771 	boolean_t tagged = B_TRUE;
772 
773 	if (vid == MAC_VLAN_UNTAGGED) {
774 		vid = 0;
775 		tagged = B_FALSE;
776 	}
777 
778 	mutex_enter(&g->mlg_mtx);
779 	if (!mlxcx_remove_vlan_entry(mlxp, g, tagged, vid)) {
780 		ret = EIO;
781 	}
782 	mutex_exit(&g->mlg_mtx);
783 
784 	return (ret);
785 }
786 
787 static int
788 mlxcx_group_remove_mac(void *arg, const uint8_t *mac_addr)
789 {
790 	mlxcx_ring_group_t *g = arg;
791 	mlxcx_t *mlxp = g->mlg_mlx;
792 	mlxcx_port_t *port = g->mlg_port;
793 	int ret = 0;
794 
795 	mutex_enter(&port->mlp_mtx);
796 	mutex_enter(&g->mlg_mtx);
797 	if (!mlxcx_remove_umcast_entry(mlxp, port, g, mac_addr)) {
798 		ret = EIO;
799 	}
800 	mutex_exit(&g->mlg_mtx);
801 	mutex_exit(&port->mlp_mtx);
802 
803 	return (ret);
804 }
805 
806 static int
807 mlxcx_mac_ring_start(mac_ring_driver_t rh, uint64_t gen_num)
808 {
809 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
810 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
811 	mlxcx_ring_group_t *g = wq->mlwq_group;
812 	mlxcx_t *mlxp = wq->mlwq_mlx;
813 
814 	ASSERT(cq != NULL);
815 	ASSERT(g != NULL);
816 
817 	ASSERT(wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ ||
818 	    wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ);
819 	if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
820 	    !mlxcx_tx_ring_start(mlxp, g, wq))
821 		return (EIO);
822 	if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
823 	    !mlxcx_rx_ring_start(mlxp, g, wq))
824 		return (EIO);
825 
826 	mutex_enter(&cq->mlcq_mtx);
827 	cq->mlcq_mac_gen = gen_num;
828 	mutex_exit(&cq->mlcq_mtx);
829 
830 	return (0);
831 }
832 
/*
 * Ring stop callback (installed as mri_stop for both tx and rx rings).
 *
 * Stops the work queue in hardware, marks its buffer shards as draining,
 * takes any buffers still attached to the completion queue back, and then
 * waits for the shards' busy lists to empty before destroying the free
 * buffers.
 *
 * If the hardware stop command fails, the function returns early with both
 * mutexes released and without draining anything.
 */
static void
mlxcx_mac_ring_stop(mac_ring_driver_t rh)
{
	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
	mlxcx_t *mlxp = wq->mlwq_mlx;
	mlxcx_buf_shard_t *s;
	mlxcx_buffer_t *buf;

	/*
	 * To prevent deadlocks and sleeping whilst holding either the
	 * CQ mutex or WQ mutex, we split the stop processing into two
	 * parts.
	 *
	 * With the CQ and WQ mutexes held the appropriate WQ is stopped.
	 * The Q in the HCA is set to Reset state and flagged as no
	 * longer started. Atomic with changing this WQ state, the buffer
	 * shards are flagged as draining.
	 *
	 * Now, any requests for buffers and attempts to submit messages
	 * will fail and once we're in this state it is safe to relinquish
	 * the CQ and WQ mutexes. Allowing us to complete the ring stop
	 * by waiting for the buffer lists, with the exception of
	 * the loaned list, to drain. Buffers on the loaned list are
	 * not under our control, we will get them back when the mblk tied
	 * to the buffer is freed.
	 */

	/* Lock order: CQ mutex first, then WQ mutex. */
	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);

	if (wq->mlwq_state & MLXCX_WQ_STARTED) {
		if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    !mlxcx_cmd_stop_rq(mlxp, wq)) {
			mutex_exit(&wq->mlwq_mtx);
			mutex_exit(&cq->mlcq_mtx);
			return;
		}
		if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    !mlxcx_cmd_stop_sq(mlxp, wq)) {
			mutex_exit(&wq->mlwq_mtx);
			mutex_exit(&cq->mlcq_mtx);
			return;
		}
	}
	/* A successful stop command is expected to clear STARTED. */
	ASSERT0(wq->mlwq_state & MLXCX_WQ_STARTED);

	mlxcx_shard_draining(wq->mlwq_bufs);
	if (wq->mlwq_foreign_bufs != NULL)
		mlxcx_shard_draining(wq->mlwq_foreign_bufs);


	if (wq->mlwq_state & MLXCX_WQ_BUFFERS) {
		list_t cq_buffers;

		/*
		 * Take the buffers away from the CQ. If the CQ is being
		 * processed and the WQ has been stopped, a completion
		 * which does not match to a buffer will be ignored.
		 */
		list_create(&cq_buffers, sizeof (mlxcx_buffer_t),
		    offsetof(mlxcx_buffer_t, mlb_cq_entry));

		list_move_tail(&cq_buffers, &cq->mlcq_buffers);

		/* The secondary "b" list has its own mutex. */
		mutex_enter(&cq->mlcq_bufbmtx);
		list_move_tail(&cq_buffers, &cq->mlcq_buffers_b);
		mutex_exit(&cq->mlcq_bufbmtx);

		cq->mlcq_bufcnt = 0;

		/*
		 * From here on we may sleep, so drop both mutexes before
		 * returning buffers and waiting on the shards.
		 */
		mutex_exit(&wq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);

		/* Return any outstanding buffers to the free pool. */
		while ((buf = list_remove_head(&cq_buffers)) != NULL) {
			mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		}
		list_destroy(&cq_buffers);

		/*
		 * Wait for the shard's busy list to empty, then destroy
		 * everything on its free list.
		 *
		 * NOTE(review): this relies on mlbs_free_nonempty being
		 * signalled as buffers leave the busy list, and on
		 * mlxcx_buf_destroy() unlinking the buffer from mlbs_free
		 * (otherwise the list_head() loop would not terminate) --
		 * neither is visible here; confirm.
		 */
		s = wq->mlwq_bufs;
		mutex_enter(&s->mlbs_mtx);
		while (!list_is_empty(&s->mlbs_busy))
			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
		while ((buf = list_head(&s->mlbs_free)) != NULL) {
			mlxcx_buf_destroy(mlxp, buf);
		}
		mutex_exit(&s->mlbs_mtx);

		/* Same again for the optional foreign buffer shard. */
		s = wq->mlwq_foreign_bufs;
		if (s != NULL) {
			mutex_enter(&s->mlbs_mtx);
			while (!list_is_empty(&s->mlbs_busy))
				cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
			while ((buf = list_head(&s->mlbs_free)) != NULL) {
				mlxcx_buf_destroy(mlxp, buf);
			}
			mutex_exit(&s->mlbs_mtx);
		}

		/* The WQ no longer owns any buffers. */
		mutex_enter(&wq->mlwq_mtx);
		wq->mlwq_state &= ~MLXCX_WQ_BUFFERS;
		mutex_exit(&wq->mlwq_mtx);
	} else {
		mutex_exit(&wq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
	}
}
941 
942 static int
943 mlxcx_mac_group_start(mac_group_driver_t gh)
944 {
945 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
946 	mlxcx_t *mlxp = g->mlg_mlx;
947 
948 	VERIFY3S(g->mlg_type, ==, MLXCX_GROUP_RX);
949 	ASSERT(mlxp != NULL);
950 
951 	if (g->mlg_state & MLXCX_GROUP_RUNNING)
952 		return (0);
953 
954 	if (!mlxcx_rx_group_start(mlxp, g))
955 		return (EIO);
956 
957 	return (0);
958 }
959 
960 static void
961 mlxcx_mac_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
962     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
963 {
964 	mlxcx_t *mlxp = (mlxcx_t *)arg;
965 	mlxcx_ring_group_t *g;
966 	mlxcx_work_queue_t *wq;
967 	mac_intr_t *mintr = &infop->mri_intr;
968 
969 	if (rtype != MAC_RING_TYPE_TX)
970 		return;
971 	ASSERT3S(group_index, ==, -1);
972 
973 	g = &mlxp->mlx_tx_groups[0];
974 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
975 	mutex_enter(&g->mlg_mtx);
976 
977 	ASSERT3S(ring_index, >=, 0);
978 	ASSERT3S(ring_index, <, g->mlg_nwqs);
979 
980 	wq = &g->mlg_wqs[ring_index];
981 
982 	wq->mlwq_cq->mlcq_mac_hdl = rh;
983 
984 	infop->mri_driver = (mac_ring_driver_t)wq;
985 	infop->mri_start = mlxcx_mac_ring_start;
986 	infop->mri_stop = mlxcx_mac_ring_stop;
987 	infop->mri_tx = mlxcx_mac_ring_tx;
988 	infop->mri_stat = mlxcx_mac_ring_stat;
989 
990 	mintr->mi_ddi_handle = mlxp->mlx_intr_handles[
991 	    wq->mlwq_cq->mlcq_eq->mleq_intr_index];
992 
993 	mutex_exit(&g->mlg_mtx);
994 }
995 
996 static int
997 mlxcx_mac_ring_intr_enable(mac_intr_handle_t intrh)
998 {
999 	mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
1000 	mlxcx_t *mlxp = cq->mlcq_mlx;
1001 
1002 	/*
1003 	 * We are going to call mlxcx_arm_cq() here, so we take the arm lock
1004 	 * as well as the CQ one to make sure we don't race against
1005 	 * mlxcx_intr_n().
1006 	 */
1007 	mutex_enter(&cq->mlcq_arm_mtx);
1008 	mutex_enter(&cq->mlcq_mtx);
1009 	if (cq->mlcq_state & MLXCX_CQ_POLLING) {
1010 		atomic_and_uint(&cq->mlcq_state, ~MLXCX_CQ_POLLING);
1011 		if (!(cq->mlcq_state & MLXCX_CQ_ARMED))
1012 			mlxcx_arm_cq(mlxp, cq);
1013 	}
1014 	mutex_exit(&cq->mlcq_mtx);
1015 	mutex_exit(&cq->mlcq_arm_mtx);
1016 
1017 	return (0);
1018 }
1019 
1020 static int
1021 mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh)
1022 {
1023 	mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
1024 
1025 	mutex_enter(&cq->mlcq_mtx);
1026 	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
1027 	mutex_exit(&cq->mlcq_mtx);
1028 
1029 	return (0);
1030 }
1031 
1032 static mblk_t *
1033 mlxcx_mac_ring_rx_poll(void *arg, int poll_bytes)
1034 {
1035 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)arg;
1036 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
1037 	mlxcx_t *mlxp = wq->mlwq_mlx;
1038 	mblk_t *mp;
1039 
1040 	ASSERT(cq != NULL);
1041 	ASSERT3S(poll_bytes, >, 0);
1042 	if (poll_bytes == 0)
1043 		return (NULL);
1044 
1045 	mutex_enter(&cq->mlcq_mtx);
1046 	mp = mlxcx_rx_poll(mlxp, cq, poll_bytes);
1047 	mutex_exit(&cq->mlcq_mtx);
1048 
1049 	return (mp);
1050 }
1051 
1052 static void
1053 mlxcx_mac_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
1054     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
1055 {
1056 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1057 	mlxcx_ring_group_t *g;
1058 	mlxcx_work_queue_t *wq;
1059 	mac_intr_t *mintr = &infop->mri_intr;
1060 
1061 	if (rtype != MAC_RING_TYPE_RX)
1062 		return;
1063 	ASSERT3S(group_index, >=, 0);
1064 	ASSERT3S(group_index, <, mlxp->mlx_rx_ngroups);
1065 
1066 	g = &mlxp->mlx_rx_groups[group_index];
1067 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
1068 	mutex_enter(&g->mlg_mtx);
1069 
1070 	ASSERT3S(ring_index, >=, 0);
1071 	ASSERT3S(ring_index, <, g->mlg_nwqs);
1072 
1073 	ASSERT(g->mlg_state & MLXCX_GROUP_WQS);
1074 	wq = &g->mlg_wqs[ring_index];
1075 
1076 	wq->mlwq_cq->mlcq_mac_hdl = rh;
1077 
1078 	infop->mri_driver = (mac_ring_driver_t)wq;
1079 	infop->mri_start = mlxcx_mac_ring_start;
1080 	infop->mri_stop = mlxcx_mac_ring_stop;
1081 	infop->mri_poll = mlxcx_mac_ring_rx_poll;
1082 	infop->mri_stat = mlxcx_mac_ring_stat;
1083 
1084 	mintr->mi_handle = (mac_intr_handle_t)wq->mlwq_cq;
1085 	mintr->mi_enable = mlxcx_mac_ring_intr_enable;
1086 	mintr->mi_disable = mlxcx_mac_ring_intr_disable;
1087 
1088 	mintr->mi_ddi_handle = mlxp->mlx_intr_handles[
1089 	    wq->mlwq_cq->mlcq_eq->mleq_intr_index];
1090 
1091 	mutex_exit(&g->mlg_mtx);
1092 }
1093 
1094 static void
1095 mlxcx_mac_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
1096     mac_group_info_t *infop, mac_group_handle_t gh)
1097 {
1098 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1099 	mlxcx_ring_group_t *g;
1100 
1101 	if (rtype != MAC_RING_TYPE_RX)
1102 		return;
1103 
1104 	ASSERT3S(index, >=, 0);
1105 	ASSERT3S(index, <, mlxp->mlx_rx_ngroups);
1106 	g = &mlxp->mlx_rx_groups[index];
1107 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
1108 
1109 	g->mlg_mac_hdl = gh;
1110 
1111 	infop->mgi_driver = (mac_group_driver_t)g;
1112 	infop->mgi_start = mlxcx_mac_group_start;
1113 	infop->mgi_stop = NULL;
1114 	infop->mgi_addmac = mlxcx_group_add_mac;
1115 	infop->mgi_remmac = mlxcx_group_remove_mac;
1116 	infop->mgi_addvlan = mlxcx_group_add_vlan;
1117 	infop->mgi_remvlan = mlxcx_group_remove_vlan;
1118 
1119 	infop->mgi_count = g->mlg_nwqs;
1120 }
1121 
1122 static boolean_t
1123 mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1124 {
1125 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1126 	mac_capab_rings_t *cap_rings;
1127 	mac_capab_led_t *cap_leds;
1128 	mac_capab_transceiver_t *cap_txr;
1129 	uint_t i, n = 0;
1130 
1131 	switch (cap) {
1132 
1133 	case MAC_CAPAB_RINGS:
1134 		cap_rings = cap_data;
1135 		cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
1136 		switch (cap_rings->mr_type) {
1137 		case MAC_RING_TYPE_TX:
1138 			cap_rings->mr_gnum = 0;
1139 			cap_rings->mr_rnum = mlxp->mlx_tx_groups[0].mlg_nwqs;
1140 			cap_rings->mr_rget = mlxcx_mac_fill_tx_ring;
1141 			cap_rings->mr_gget = NULL;
1142 			cap_rings->mr_gaddring = NULL;
1143 			cap_rings->mr_gremring = NULL;
1144 			break;
1145 		case MAC_RING_TYPE_RX:
1146 			cap_rings->mr_gnum = mlxp->mlx_rx_ngroups;
1147 			for (i = 0; i < mlxp->mlx_rx_ngroups; ++i)
1148 				n += mlxp->mlx_rx_groups[i].mlg_nwqs;
1149 			cap_rings->mr_rnum = n;
1150 			cap_rings->mr_rget = mlxcx_mac_fill_rx_ring;
1151 			cap_rings->mr_gget = mlxcx_mac_fill_rx_group;
1152 			cap_rings->mr_gaddring = NULL;
1153 			cap_rings->mr_gremring = NULL;
1154 			break;
1155 		default:
1156 			return (B_FALSE);
1157 		}
1158 		break;
1159 
1160 	case MAC_CAPAB_HCKSUM:
1161 		if (mlxp->mlx_caps->mlc_checksum) {
1162 			*(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 |
1163 			    HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM;
1164 		}
1165 		break;
1166 
1167 	case MAC_CAPAB_LED:
1168 		cap_leds = cap_data;
1169 
1170 		cap_leds->mcl_flags = 0;
1171 		cap_leds->mcl_modes = MAC_LED_DEFAULT | MAC_LED_OFF |
1172 		    MAC_LED_IDENT;
1173 		cap_leds->mcl_set = mlxcx_mac_led_set;
1174 		break;
1175 
1176 	case MAC_CAPAB_TRANSCEIVER:
1177 		cap_txr = cap_data;
1178 
1179 		cap_txr->mct_flags = 0;
1180 		cap_txr->mct_ntransceivers = 1;
1181 		cap_txr->mct_info = mlxcx_mac_txr_info;
1182 		cap_txr->mct_read = mlxcx_mac_txr_read;
1183 		break;
1184 
1185 	default:
1186 		return (B_FALSE);
1187 	}
1188 
1189 	return (B_TRUE);
1190 }
1191 
1192 static void
1193 mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1194     mac_prop_info_handle_t prh)
1195 {
1196 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1197 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1198 
1199 	mutex_enter(&port->mlp_mtx);
1200 
1201 	switch (pr_num) {
1202 	case MAC_PROP_DUPLEX:
1203 	case MAC_PROP_SPEED:
1204 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1205 		break;
1206 	case MAC_PROP_MTU:
1207 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1208 		mac_prop_info_set_range_uint32(prh, MLXCX_MTU_OFFSET,
1209 		    port->mlp_max_mtu);
1210 		mac_prop_info_set_default_uint32(prh,
1211 		    port->mlp_mtu - MLXCX_MTU_OFFSET);
1212 		break;
1213 	case MAC_PROP_AUTONEG:
1214 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1215 		mac_prop_info_set_default_uint8(prh, 1);
1216 		break;
1217 	case MAC_PROP_ADV_FEC_CAP:
1218 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1219 		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
1220 		break;
1221 	case MAC_PROP_EN_FEC_CAP:
1222 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1223 		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
1224 		break;
1225 	case MAC_PROP_ADV_400GFDX_CAP:
1226 	case MAC_PROP_EN_400GFDX_CAP:
1227 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1228 		mac_prop_info_set_default_uint8(prh,
1229 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_400G) != 0);
1230 		break;
1231 	case MAC_PROP_ADV_200GFDX_CAP:
1232 	case MAC_PROP_EN_200GFDX_CAP:
1233 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1234 		mac_prop_info_set_default_uint8(prh,
1235 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_200G) != 0);
1236 		break;
1237 	case MAC_PROP_ADV_100GFDX_CAP:
1238 	case MAC_PROP_EN_100GFDX_CAP:
1239 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1240 		mac_prop_info_set_default_uint8(prh,
1241 		    ((port->mlp_oper_proto & MLXCX_PROTO_100G) != 0 ||
1242 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_100G)) != 0);
1243 		break;
1244 	case MAC_PROP_ADV_50GFDX_CAP:
1245 	case MAC_PROP_EN_50GFDX_CAP:
1246 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1247 		mac_prop_info_set_default_uint8(prh,
1248 		    ((port->mlp_oper_proto & MLXCX_PROTO_50G) != 0 ||
1249 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_50G)) != 0);
1250 		break;
1251 	case MAC_PROP_ADV_40GFDX_CAP:
1252 	case MAC_PROP_EN_40GFDX_CAP:
1253 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1254 		mac_prop_info_set_default_uint8(prh,
1255 		    ((port->mlp_oper_proto & MLXCX_PROTO_40G) != 0 ||
1256 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_40G)) != 0);
1257 		break;
1258 	case MAC_PROP_ADV_25GFDX_CAP:
1259 	case MAC_PROP_EN_25GFDX_CAP:
1260 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1261 		mac_prop_info_set_default_uint8(prh,
1262 		    ((port->mlp_oper_proto & MLXCX_PROTO_25G) != 0 ||
1263 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_25G)) != 0);
1264 		break;
1265 	case MAC_PROP_ADV_10GFDX_CAP:
1266 	case MAC_PROP_EN_10GFDX_CAP:
1267 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1268 		mac_prop_info_set_default_uint8(prh,
1269 		    ((port->mlp_oper_proto & MLXCX_PROTO_10G) != 0 ||
1270 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_10G)) != 0);
1271 		break;
1272 	case MAC_PROP_ADV_1000FDX_CAP:
1273 	case MAC_PROP_EN_1000FDX_CAP:
1274 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1275 		mac_prop_info_set_default_uint8(prh,
1276 		    ((port->mlp_oper_proto & MLXCX_PROTO_1G) != 0 ||
1277 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_1G)) != 0);
1278 		break;
1279 	case MAC_PROP_ADV_100FDX_CAP:
1280 	case MAC_PROP_EN_100FDX_CAP:
1281 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1282 		mac_prop_info_set_default_uint8(prh,
1283 		    ((port->mlp_oper_proto & MLXCX_PROTO_100M) != 0 ||
1284 		    (port->mlp_ext_oper_proto & MLXCX_EXTPROTO_100M)) != 0);
1285 		break;
1286 	default:
1287 		break;
1288 	}
1289 
1290 	mutex_exit(&port->mlp_mtx);
1291 }
1292 
1293 static int
1294 mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1295     uint_t pr_valsize, const void *pr_val)
1296 {
1297 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1298 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1299 	int ret = 0;
1300 	uint32_t new_mtu, new_hw_mtu, old_mtu;
1301 	mlxcx_buf_shard_t *sh;
1302 	boolean_t allocd = B_FALSE;
1303 	boolean_t relink = B_FALSE;
1304 	link_fec_t fec;
1305 	mlxcx_pplm_fec_caps_t cap_fec;
1306 
1307 	mutex_enter(&port->mlp_mtx);
1308 
1309 	switch (pr_num) {
1310 	case MAC_PROP_MTU:
1311 		bcopy(pr_val, &new_mtu, sizeof (new_mtu));
1312 		new_hw_mtu = new_mtu + MLXCX_MTU_OFFSET;
1313 		if (new_hw_mtu == port->mlp_mtu)
1314 			break;
1315 		if (new_hw_mtu > port->mlp_max_mtu) {
1316 			ret = EINVAL;
1317 			break;
1318 		}
1319 		sh = list_head(&mlxp->mlx_buf_shards);
1320 		for (; sh != NULL; sh = list_next(&mlxp->mlx_buf_shards, sh)) {
1321 			mutex_enter(&sh->mlbs_mtx);
1322 			if (!list_is_empty(&sh->mlbs_free) ||
1323 			    !list_is_empty(&sh->mlbs_busy) ||
1324 			    !list_is_empty(&sh->mlbs_loaned)) {
1325 				allocd = B_TRUE;
1326 				mutex_exit(&sh->mlbs_mtx);
1327 				break;
1328 			}
1329 			mutex_exit(&sh->mlbs_mtx);
1330 		}
1331 		if (allocd) {
1332 			ret = EBUSY;
1333 			break;
1334 		}
1335 		old_mtu = port->mlp_mtu;
1336 		ret = mac_maxsdu_update(mlxp->mlx_mac_hdl, new_mtu);
1337 		if (ret != 0)
1338 			break;
1339 		port->mlp_mtu = new_hw_mtu;
1340 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, port,
1341 		    MLXCX_MODIFY_NIC_VPORT_CTX_MTU)) {
1342 			port->mlp_mtu = old_mtu;
1343 			(void) mac_maxsdu_update(mlxp->mlx_mac_hdl, old_mtu);
1344 			ret = EIO;
1345 			break;
1346 		}
1347 		if (!mlxcx_cmd_set_port_mtu(mlxp, port)) {
1348 			port->mlp_mtu = old_mtu;
1349 			(void) mac_maxsdu_update(mlxp->mlx_mac_hdl, old_mtu);
1350 			ret = EIO;
1351 			break;
1352 		}
1353 		break;
1354 
1355 	case MAC_PROP_EN_FEC_CAP:
1356 		bcopy(pr_val, &fec, sizeof (fec));
1357 		if (!mlxcx_link_fec_cap(fec, &cap_fec)) {
1358 			ret = EINVAL;
1359 			break;
1360 		}
1361 
1362 		/*
1363 		 * Don't change the FEC if it is already at the requested
1364 		 * setting AND the port is up.
1365 		 * When the port is down, always set the FEC and attempt
1366 		 * to retrain the link.
1367 		 */
1368 		if (fec == port->mlp_fec_requested &&
1369 		    fec == mlxcx_fec_to_link_fec(port->mlp_fec_active) &&
1370 		    port->mlp_oper_status != MLXCX_PORT_STATUS_DOWN)
1371 			break;
1372 
1373 		/*
1374 		 * The most like cause of this failing is an invalid
1375 		 * or unsupported fec option.
1376 		 */
1377 		if (!mlxcx_cmd_modify_port_fec(mlxp, port, cap_fec)) {
1378 			ret = EINVAL;
1379 			break;
1380 		}
1381 
1382 		port->mlp_fec_requested = fec;
1383 
1384 		/*
1385 		 * For FEC to become effective, the link needs to go back
1386 		 * to training and negotiation state. This happens when
1387 		 * the link transitions from down to up, force a relink.
1388 		 */
1389 		relink = B_TRUE;
1390 		break;
1391 
1392 	default:
1393 		ret = ENOTSUP;
1394 		break;
1395 	}
1396 
1397 	if (relink) {
1398 		if (!mlxcx_cmd_modify_port_status(mlxp, port,
1399 		    MLXCX_PORT_STATUS_DOWN) ||
1400 		    !mlxcx_cmd_modify_port_status(mlxp, port,
1401 		    MLXCX_PORT_STATUS_UP)) {
1402 			ret = EIO;
1403 		}
1404 	}
1405 	mutex_exit(&port->mlp_mtx);
1406 
1407 	return (ret);
1408 }
1409 
1410 static int
1411 mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1412     uint_t pr_valsize, void *pr_val)
1413 {
1414 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1415 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1416 	uint64_t speed;
1417 	int ret = 0;
1418 
1419 	mutex_enter(&port->mlp_mtx);
1420 
1421 	switch (pr_num) {
1422 	case MAC_PROP_DUPLEX:
1423 		if (pr_valsize < sizeof (link_duplex_t)) {
1424 			ret = EOVERFLOW;
1425 			break;
1426 		}
1427 		/* connectx parts only support full duplex */
1428 		*(link_duplex_t *)pr_val = LINK_DUPLEX_FULL;
1429 		break;
1430 	case MAC_PROP_SPEED:
1431 		if (pr_valsize < sizeof (uint64_t)) {
1432 			ret = EOVERFLOW;
1433 			break;
1434 		}
1435 		speed = mlxcx_speed_to_bits(port->mlp_oper_proto,
1436 		    port->mlp_ext_oper_proto);
1437 		bcopy(&speed, pr_val, sizeof (speed));
1438 		break;
1439 	case MAC_PROP_STATUS:
1440 		if (pr_valsize < sizeof (link_state_t)) {
1441 			ret = EOVERFLOW;
1442 			break;
1443 		}
1444 		switch (port->mlp_oper_status) {
1445 		case MLXCX_PORT_STATUS_UP:
1446 		case MLXCX_PORT_STATUS_UP_ONCE:
1447 			*(link_state_t *)pr_val = LINK_STATE_UP;
1448 			break;
1449 		case MLXCX_PORT_STATUS_DOWN:
1450 			*(link_state_t *)pr_val = LINK_STATE_DOWN;
1451 			break;
1452 		default:
1453 			*(link_state_t *)pr_val = LINK_STATE_UNKNOWN;
1454 		}
1455 		break;
1456 	case MAC_PROP_AUTONEG:
1457 		if (pr_valsize < sizeof (uint8_t)) {
1458 			ret = EOVERFLOW;
1459 			break;
1460 		}
1461 		*(uint8_t *)pr_val = port->mlp_autoneg;
1462 		break;
1463 	case MAC_PROP_ADV_FEC_CAP:
1464 		if (pr_valsize < sizeof (link_fec_t)) {
1465 			ret = EOVERFLOW;
1466 			break;
1467 		}
1468 		*(link_fec_t *)pr_val =
1469 		    mlxcx_fec_to_link_fec(port->mlp_fec_active);
1470 		break;
1471 	case MAC_PROP_EN_FEC_CAP:
1472 		if (pr_valsize < sizeof (link_fec_t)) {
1473 			ret = EOVERFLOW;
1474 			break;
1475 		}
1476 		*(link_fec_t *)pr_val = port->mlp_fec_requested;
1477 		break;
1478 	case MAC_PROP_MTU:
1479 		if (pr_valsize < sizeof (uint32_t)) {
1480 			ret = EOVERFLOW;
1481 			break;
1482 		}
1483 		*(uint32_t *)pr_val = port->mlp_mtu - MLXCX_MTU_OFFSET;
1484 		break;
1485 	case MAC_PROP_ADV_400GFDX_CAP:
1486 	case MAC_PROP_EN_400GFDX_CAP:
1487 		if (pr_valsize < sizeof (uint8_t)) {
1488 			ret = EOVERFLOW;
1489 			break;
1490 		}
1491 		*(uint8_t *)pr_val =
1492 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_400G) != 0;
1493 		break;
1494 	case MAC_PROP_ADV_200GFDX_CAP:
1495 	case MAC_PROP_EN_200GFDX_CAP:
1496 		if (pr_valsize < sizeof (uint8_t)) {
1497 			ret = EOVERFLOW;
1498 			break;
1499 		}
1500 		*(uint8_t *)pr_val =
1501 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_200G) != 0;
1502 		break;
1503 	case MAC_PROP_ADV_100GFDX_CAP:
1504 	case MAC_PROP_EN_100GFDX_CAP:
1505 		if (pr_valsize < sizeof (uint8_t)) {
1506 			ret = EOVERFLOW;
1507 			break;
1508 		}
1509 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1510 		    MLXCX_PROTO_100G) != 0 ||
1511 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_100G) != 0;
1512 		break;
1513 	case MAC_PROP_ADV_50GFDX_CAP:
1514 	case MAC_PROP_EN_50GFDX_CAP:
1515 		if (pr_valsize < sizeof (uint8_t)) {
1516 			ret = EOVERFLOW;
1517 			break;
1518 		}
1519 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1520 		    MLXCX_PROTO_50G) != 0 ||
1521 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_50G) != 0;
1522 		break;
1523 	case MAC_PROP_ADV_40GFDX_CAP:
1524 	case MAC_PROP_EN_40GFDX_CAP:
1525 		if (pr_valsize < sizeof (uint8_t)) {
1526 			ret = EOVERFLOW;
1527 			break;
1528 		}
1529 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1530 		    MLXCX_PROTO_40G) != 0 ||
1531 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_40G) != 0;
1532 		break;
1533 	case MAC_PROP_ADV_25GFDX_CAP:
1534 	case MAC_PROP_EN_25GFDX_CAP:
1535 		if (pr_valsize < sizeof (uint8_t)) {
1536 			ret = EOVERFLOW;
1537 			break;
1538 		}
1539 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1540 		    MLXCX_PROTO_25G) != 0 ||
1541 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_25G) != 0;
1542 		break;
1543 	case MAC_PROP_ADV_10GFDX_CAP:
1544 	case MAC_PROP_EN_10GFDX_CAP:
1545 		if (pr_valsize < sizeof (uint8_t)) {
1546 			ret = EOVERFLOW;
1547 			break;
1548 		}
1549 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1550 		    MLXCX_PROTO_10G) != 0 ||
1551 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_10G) != 0;
1552 		break;
1553 	case MAC_PROP_ADV_1000FDX_CAP:
1554 	case MAC_PROP_EN_1000FDX_CAP:
1555 		if (pr_valsize < sizeof (uint8_t)) {
1556 			ret = EOVERFLOW;
1557 			break;
1558 		}
1559 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1560 		    MLXCX_PROTO_1G) != 0 ||
1561 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_1G) != 0;
1562 		break;
1563 	case MAC_PROP_ADV_100FDX_CAP:
1564 	case MAC_PROP_EN_100FDX_CAP:
1565 		if (pr_valsize < sizeof (uint8_t)) {
1566 			ret = EOVERFLOW;
1567 			break;
1568 		}
1569 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1570 		    MLXCX_PROTO_100M) != 0 ||
1571 		    (port->mlp_ext_max_proto & MLXCX_EXTPROTO_100M) != 0;
1572 		break;
1573 	default:
1574 		ret = ENOTSUP;
1575 		break;
1576 	}
1577 
1578 	mutex_exit(&port->mlp_mtx);
1579 
1580 	return (ret);
1581 }
1582 
1583 #define	MLXCX_MAC_CALLBACK_FLAGS \
1584 	(MC_GETCAPAB | MC_GETPROP | MC_PROPINFO | MC_SETPROP)
1585 
1586 static mac_callbacks_t mlxcx_mac_callbacks = {
1587 	.mc_callbacks = MLXCX_MAC_CALLBACK_FLAGS,
1588 	.mc_getstat = mlxcx_mac_stat,
1589 	.mc_start = mlxcx_mac_start,
1590 	.mc_stop = mlxcx_mac_stop,
1591 	.mc_setpromisc = mlxcx_mac_setpromisc,
1592 	.mc_multicst = mlxcx_mac_multicast,
1593 	.mc_ioctl = NULL,
1594 	.mc_getcapab = mlxcx_mac_getcapab,
1595 	.mc_setprop = mlxcx_mac_setprop,
1596 	.mc_getprop = mlxcx_mac_getprop,
1597 	.mc_propinfo = mlxcx_mac_propinfo,
1598 	.mc_tx = NULL,
1599 	.mc_unicst = NULL,
1600 };
1601 
1602 boolean_t
1603 mlxcx_register_mac(mlxcx_t *mlxp)
1604 {
1605 	mac_register_t *mac = mac_alloc(MAC_VERSION);
1606 	mlxcx_port_t *port;
1607 	int ret;
1608 
1609 	if (mac == NULL)
1610 		return (B_FALSE);
1611 
1612 	VERIFY3U(mlxp->mlx_nports, ==, 1);
1613 	port = &mlxp->mlx_ports[0];
1614 
1615 	mutex_enter(&port->mlp_mtx);
1616 
1617 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1618 	mac->m_driver = mlxp;
1619 	mac->m_dip = mlxp->mlx_dip;
1620 	mac->m_src_addr = port->mlp_mac_address;
1621 	mac->m_callbacks = &mlxcx_mac_callbacks;
1622 	mac->m_min_sdu = MLXCX_MTU_OFFSET;
1623 	mac->m_max_sdu = port->mlp_mtu - MLXCX_MTU_OFFSET;
1624 	mac->m_margin = VLAN_TAGSZ;
1625 	mac->m_priv_props = mlxcx_priv_props;
1626 	mac->m_v12n = MAC_VIRT_LEVEL1;
1627 
1628 	ret = mac_register(mac, &mlxp->mlx_mac_hdl);
1629 	if (ret != 0) {
1630 		mlxcx_warn(mlxp, "mac_register() returned %d", ret);
1631 	}
1632 	mac_free(mac);
1633 
1634 	mutex_exit(&port->mlp_mtx);
1635 
1636 	mlxcx_update_link_state(mlxp, port);
1637 
1638 	return (ret == 0);
1639 }
1640