xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx_gld.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2021, the University of Queensland
14  * Copyright 2020 RackTop Systems, Inc.
15  */
16 
17 /*
18  * Mellanox Connect-X 4/5/6 driver.
19  */
20 
21 #include <sys/modctl.h>
22 #include <sys/conf.h>
23 #include <sys/devops.h>
24 #include <sys/sysmacros.h>
25 #include <sys/vlan.h>
26 
27 #include <sys/pattr.h>
28 #include <sys/dlpi.h>
29 
30 #include <sys/mac_provider.h>
31 
32 /* Need these for mac_vlan_header_info() */
33 #include <sys/mac_client.h>
34 #include <sys/mac_client_priv.h>
35 
36 #include <mlxcx.h>
37 
/*
 * NULL-terminated list of driver-private property names. The list is
 * currently empty: it holds only the terminator.
 */
static char *mlxcx_priv_props[] = { NULL };

/* Bits-per-second multipliers used when reporting link speed. */
#define	MBITS		1000000ULL
#define	GBITS		(MBITS * 1000ULL)
44 
45 static uint64_t
46 mlxcx_speed_to_bits(mlxcx_eth_proto_t v)
47 {
48 	switch (v) {
49 	case MLXCX_PROTO_SGMII_100BASE:
50 		return (100ULL * MBITS);
51 	case MLXCX_PROTO_SGMII:
52 	case MLXCX_PROTO_1000BASE_KX:
53 		return (1000ULL * MBITS);
54 	case MLXCX_PROTO_10GBASE_CX4:
55 	case MLXCX_PROTO_10GBASE_KX4:
56 	case MLXCX_PROTO_10GBASE_KR:
57 	case MLXCX_PROTO_10GBASE_CR:
58 	case MLXCX_PROTO_10GBASE_SR:
59 	case MLXCX_PROTO_10GBASE_ER_LR:
60 		return (10ULL * GBITS);
61 	case MLXCX_PROTO_40GBASE_CR4:
62 	case MLXCX_PROTO_40GBASE_KR4:
63 	case MLXCX_PROTO_40GBASE_SR4:
64 	case MLXCX_PROTO_40GBASE_LR4_ER4:
65 		return (40ULL * GBITS);
66 	case MLXCX_PROTO_25GBASE_CR:
67 	case MLXCX_PROTO_25GBASE_KR:
68 	case MLXCX_PROTO_25GBASE_SR:
69 		return (25ULL * GBITS);
70 	case MLXCX_PROTO_50GBASE_SR2:
71 	case MLXCX_PROTO_50GBASE_CR2:
72 	case MLXCX_PROTO_50GBASE_KR2:
73 		return (50ULL * GBITS);
74 	case MLXCX_PROTO_100GBASE_CR4:
75 	case MLXCX_PROTO_100GBASE_SR4:
76 	case MLXCX_PROTO_100GBASE_KR4:
77 		return (100ULL * GBITS);
78 	default:
79 		return (0);
80 	}
81 }
82 
83 static link_fec_t
84 mlxcx_fec_to_link_fec(mlxcx_pplm_fec_active_t mlxcx_fec)
85 {
86 	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_NONE) != 0)
87 		return (LINK_FEC_NONE);
88 
89 	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_FIRECODE) != 0)
90 		return (LINK_FEC_BASE_R);
91 
92 	if ((mlxcx_fec & (MLXCX_PPLM_FEC_ACTIVE_RS528 |
93 	    MLXCX_PPLM_FEC_ACTIVE_RS271 | MLXCX_PPLM_FEC_ACTIVE_RS544 |
94 	    MLXCX_PPLM_FEC_ACTIVE_RS272)) != 0)
95 		return (LINK_FEC_RS);
96 
97 	return (LINK_FEC_NONE);
98 }
99 
100 static boolean_t
101 mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp)
102 {
103 	mlxcx_pplm_fec_caps_t pplm_fec = 0;
104 
105 	if ((fec & LINK_FEC_AUTO) != 0) {
106 		pplm_fec = MLXCX_PPLM_FEC_CAP_AUTO;
107 		fec &= ~LINK_FEC_AUTO;
108 	} else if ((fec & LINK_FEC_NONE) != 0) {
109 		pplm_fec = MLXCX_PPLM_FEC_CAP_NONE;
110 		fec &= ~LINK_FEC_NONE;
111 	} else if ((fec & LINK_FEC_RS) != 0) {
112 		pplm_fec |= MLXCX_PPLM_FEC_CAP_RS;
113 		fec &= ~LINK_FEC_RS;
114 	} else if ((fec & LINK_FEC_BASE_R) != 0) {
115 		pplm_fec |= MLXCX_PPLM_FEC_CAP_FIRECODE;
116 		fec &= ~LINK_FEC_BASE_R;
117 	}
118 
119 	/*
120 	 * Only one fec option is allowed.
121 	 */
122 	if (fec != 0)
123 		return (B_FALSE);
124 
125 	*pfecp = pplm_fec;
126 
127 	return (B_TRUE);
128 }
129 
130 static int
131 mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat,
132     uint64_t *val)
133 {
134 	int ret = 0;
135 	boolean_t ok;
136 	mlxcx_register_data_t data;
137 	mlxcx_ppcnt_rfc_2863_t *st;
138 
139 	ASSERT(mutex_owned(&port->mlp_mtx));
140 
141 	bzero(&data, sizeof (data));
142 	data.mlrd_ppcnt.mlrd_ppcnt_local_port = port->mlp_num + 1;
143 	data.mlrd_ppcnt.mlrd_ppcnt_grp = MLXCX_PPCNT_GRP_RFC_2863;
144 	data.mlrd_ppcnt.mlrd_ppcnt_clear = MLXCX_PPCNT_NO_CLEAR;
145 
146 	ok = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
147 	    MLXCX_REG_PPCNT, &data);
148 	if (!ok)
149 		return (EIO);
150 	st = &data.mlrd_ppcnt.mlrd_ppcnt_rfc_2863;
151 
152 	switch (stat) {
153 	case MAC_STAT_RBYTES:
154 		*val = from_be64(st->mlppc_rfc_2863_in_octets);
155 		break;
156 	case MAC_STAT_MULTIRCV:
157 		*val = from_be64(st->mlppc_rfc_2863_in_mcast_pkts);
158 		break;
159 	case MAC_STAT_BRDCSTRCV:
160 		*val = from_be64(st->mlppc_rfc_2863_in_bcast_pkts);
161 		break;
162 	case MAC_STAT_MULTIXMT:
163 		*val = from_be64(st->mlppc_rfc_2863_out_mcast_pkts);
164 		break;
165 	case MAC_STAT_BRDCSTXMT:
166 		*val = from_be64(st->mlppc_rfc_2863_out_bcast_pkts);
167 		break;
168 	case MAC_STAT_IERRORS:
169 		*val = from_be64(st->mlppc_rfc_2863_in_errors);
170 		break;
171 	case MAC_STAT_UNKNOWNS:
172 		*val = from_be64(st->mlppc_rfc_2863_in_unknown_protos);
173 		break;
174 	case MAC_STAT_OERRORS:
175 		*val = from_be64(st->mlppc_rfc_2863_out_errors);
176 		break;
177 	case MAC_STAT_OBYTES:
178 		*val = from_be64(st->mlppc_rfc_2863_out_octets);
179 		break;
180 	default:
181 		ret = ENOTSUP;
182 	}
183 
184 	return (ret);
185 }
186 
187 static int
188 mlxcx_mac_stat_ieee_802_3(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat,
189     uint64_t *val)
190 {
191 	int ret = 0;
192 	boolean_t ok;
193 	mlxcx_register_data_t data;
194 	mlxcx_ppcnt_ieee_802_3_t *st;
195 
196 	ASSERT(mutex_owned(&port->mlp_mtx));
197 
198 	bzero(&data, sizeof (data));
199 	data.mlrd_ppcnt.mlrd_ppcnt_local_port = port->mlp_num + 1;
200 	data.mlrd_ppcnt.mlrd_ppcnt_grp = MLXCX_PPCNT_GRP_IEEE_802_3;
201 	data.mlrd_ppcnt.mlrd_ppcnt_clear = MLXCX_PPCNT_NO_CLEAR;
202 
203 	ok = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
204 	    MLXCX_REG_PPCNT, &data);
205 	if (!ok)
206 		return (EIO);
207 	st = &data.mlrd_ppcnt.mlrd_ppcnt_ieee_802_3;
208 
209 	switch (stat) {
210 	case MAC_STAT_IPACKETS:
211 		*val = from_be64(st->mlppc_ieee_802_3_frames_rx);
212 		break;
213 	case MAC_STAT_OPACKETS:
214 		*val = from_be64(st->mlppc_ieee_802_3_frames_tx);
215 		break;
216 	case ETHER_STAT_ALIGN_ERRORS:
217 		*val = from_be64(st->mlppc_ieee_802_3_align_err);
218 		break;
219 	case ETHER_STAT_FCS_ERRORS:
220 		*val = from_be64(st->mlppc_ieee_802_3_fcs_err);
221 		break;
222 	case ETHER_STAT_TOOLONG_ERRORS:
223 		*val = from_be64(st->mlppc_ieee_802_3_frame_too_long_err);
224 		break;
225 	default:
226 		ret = ENOTSUP;
227 	}
228 
229 	return (ret);
230 }
231 
232 static int
233 mlxcx_mac_stat(void *arg, uint_t stat, uint64_t *val)
234 {
235 	mlxcx_t *mlxp = (mlxcx_t *)arg;
236 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
237 	int ret = 0;
238 
239 	mutex_enter(&port->mlp_mtx);
240 
241 	switch (stat) {
242 	case MAC_STAT_IFSPEED:
243 		*val = mlxcx_speed_to_bits(port->mlp_oper_proto);
244 		break;
245 	case ETHER_STAT_LINK_DUPLEX:
246 		*val = LINK_DUPLEX_FULL;
247 		break;
248 	case MAC_STAT_RBYTES:
249 	case MAC_STAT_MULTIRCV:
250 	case MAC_STAT_BRDCSTRCV:
251 	case MAC_STAT_MULTIXMT:
252 	case MAC_STAT_BRDCSTXMT:
253 	case MAC_STAT_IERRORS:
254 	case MAC_STAT_UNKNOWNS:
255 	case MAC_STAT_OERRORS:
256 	case MAC_STAT_OBYTES:
257 		ret = mlxcx_mac_stat_rfc_2863(mlxp, port, stat, val);
258 		break;
259 	case MAC_STAT_IPACKETS:
260 	case MAC_STAT_OPACKETS:
261 	case ETHER_STAT_ALIGN_ERRORS:
262 	case ETHER_STAT_FCS_ERRORS:
263 	case ETHER_STAT_TOOLONG_ERRORS:
264 		ret = mlxcx_mac_stat_ieee_802_3(mlxp, port, stat, val);
265 		break;
266 	case MAC_STAT_NORCVBUF:
267 		*val = port->mlp_stats.mlps_rx_drops;
268 		break;
269 	default:
270 		ret = ENOTSUP;
271 	}
272 
273 	mutex_exit(&port->mlp_mtx);
274 
275 	return (ret);
276 }
277 
278 static int
279 mlxcx_mac_led_set(void *arg, mac_led_mode_t mode, uint_t flags)
280 {
281 	mlxcx_t *mlxp = arg;
282 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
283 	int ret = 0;
284 
285 	if (flags != 0) {
286 		return (EINVAL);
287 	}
288 
289 	mutex_enter(&port->mlp_mtx);
290 
291 	switch (mode) {
292 	case MAC_LED_DEFAULT:
293 	case MAC_LED_OFF:
294 		if (!mlxcx_cmd_set_port_led(mlxp, port, 0)) {
295 			ret = EIO;
296 			break;
297 		}
298 		break;
299 	case MAC_LED_IDENT:
300 		if (!mlxcx_cmd_set_port_led(mlxp, port, UINT16_MAX)) {
301 			ret = EIO;
302 			break;
303 		}
304 		break;
305 	default:
306 		ret = ENOTSUP;
307 	}
308 
309 	mutex_exit(&port->mlp_mtx);
310 
311 	return (ret);
312 }
313 
314 static int
315 mlxcx_mac_txr_info(void *arg, uint_t id, mac_transceiver_info_t *infop)
316 {
317 	mlxcx_t *mlxp = arg;
318 	mlxcx_module_status_t st;
319 
320 	if (!mlxcx_cmd_query_module_status(mlxp, id, &st, NULL))
321 		return (EIO);
322 
323 	if (st != MLXCX_MODULE_UNPLUGGED)
324 		mac_transceiver_info_set_present(infop, B_TRUE);
325 
326 	if (st == MLXCX_MODULE_PLUGGED)
327 		mac_transceiver_info_set_usable(infop, B_TRUE);
328 
329 	return (0);
330 }
331 
332 static int
333 mlxcx_mac_txr_read(void *arg, uint_t id, uint_t page, void *vbuf,
334     size_t nbytes, off_t offset, size_t *nread)
335 {
336 	mlxcx_t *mlxp = arg;
337 	mlxcx_register_data_t data;
338 	uint8_t *buf = vbuf;
339 	boolean_t ok;
340 	size_t take, done = 0;
341 	uint8_t i2c_addr;
342 
343 	if (id != 0 || vbuf == NULL || nbytes == 0 || nread == NULL)
344 		return (EINVAL);
345 
346 	if (nbytes > 256 || offset >= 256 || (offset + nbytes > 256))
347 		return (EINVAL);
348 
349 	/*
350 	 * The PRM is really not very clear about any of this, but it seems
351 	 * that the i2c_device_addr field in MCIA is the SFP+ spec "page"
352 	 * number shifted right by 1 bit. They're written in the SFF spec
353 	 * like "1010000X" so Mellanox just dropped the X.
354 	 *
355 	 * This means that if we want page 0xA0, we put 0x50 in the
356 	 * i2c_device_addr field.
357 	 *
358 	 * The "page_number" field in MCIA means something else. Don't ask me
359 	 * what. FreeBSD leaves it as zero, so we will too!
360 	 */
361 	i2c_addr = page >> 1;
362 
363 	while (done < nbytes) {
364 		take = nbytes - done;
365 		if (take > sizeof (data.mlrd_mcia.mlrd_mcia_data))
366 			take = sizeof (data.mlrd_mcia.mlrd_mcia_data);
367 
368 		bzero(&data, sizeof (data));
369 		ASSERT3U(id, <=, 0xff);
370 		data.mlrd_mcia.mlrd_mcia_module = (uint8_t)id;
371 		data.mlrd_mcia.mlrd_mcia_i2c_device_addr = i2c_addr;
372 		data.mlrd_mcia.mlrd_mcia_device_addr = to_be16(offset);
373 		data.mlrd_mcia.mlrd_mcia_size = to_be16(take);
374 
375 		ok = mlxcx_cmd_access_register(mlxp,
376 		    MLXCX_CMD_ACCESS_REGISTER_READ, MLXCX_REG_MCIA, &data);
377 		if (!ok) {
378 			*nread = 0;
379 			return (EIO);
380 		}
381 
382 		if (data.mlrd_mcia.mlrd_mcia_status != MLXCX_MCIA_STATUS_OK) {
383 			*nread = 0;
384 			return (EIO);
385 		}
386 
387 		bcopy(data.mlrd_mcia.mlrd_mcia_data, &buf[done], take);
388 
389 		done += take;
390 		offset += take;
391 	}
392 	*nread = done;
393 	return (0);
394 }
395 
396 static int
397 mlxcx_mac_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
398 {
399 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
400 	(void) wq;
401 
402 	/*
403 	 * We should add support for using hw flow counters and such to
404 	 * get per-ring statistics. Not done yet though!
405 	 */
406 
407 	switch (stat) {
408 	default:
409 		*val = 0;
410 		return (ENOTSUP);
411 	}
412 
413 	return (0);
414 }
415 
416 static int
417 mlxcx_mac_start(void *arg)
418 {
419 	mlxcx_t *mlxp = (mlxcx_t *)arg;
420 	(void) mlxp;
421 	return (0);
422 }
423 
424 static void
425 mlxcx_mac_stop(void *arg)
426 {
427 	mlxcx_t *mlxp = (mlxcx_t *)arg;
428 	(void) mlxp;
429 }
430 
431 static mblk_t *
432 mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
433 {
434 	mlxcx_work_queue_t *sq = (mlxcx_work_queue_t *)arg;
435 	mlxcx_t *mlxp = sq->mlwq_mlx;
436 	mlxcx_completion_queue_t *cq;
437 	mlxcx_buffer_t *b;
438 	mac_header_info_t mhi;
439 	mblk_t *kmp, *nmp;
440 	uint8_t inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN];
441 	size_t inline_hdrlen, rem, off;
442 	uint32_t chkflags = 0;
443 	boolean_t ok;
444 	size_t take = 0;
445 	uint_t bcount;
446 
447 	VERIFY(mp->b_next == NULL);
448 
449 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &chkflags);
450 
451 	if (mac_vlan_header_info(mlxp->mlx_mac_hdl, mp, &mhi) != 0) {
452 		/*
453 		 * We got given a frame without a valid L2 header on it. We
454 		 * can't really transmit that (mlx parts don't like it), so
455 		 * we will just drop it on the floor.
456 		 */
457 		freemsg(mp);
458 		return (NULL);
459 	}
460 
461 	inline_hdrlen = rem = mhi.mhi_hdrsize;
462 
463 	kmp = mp;
464 	off = 0;
465 	while (rem > 0) {
466 		const ptrdiff_t sz = MBLKL(kmp);
467 		ASSERT3S(sz, >=, 0);
468 		ASSERT3U(sz, <=, SIZE_MAX);
469 		take = sz;
470 		if (take > rem)
471 			take = rem;
472 		bcopy(kmp->b_rptr, inline_hdrs + off, take);
473 		rem -= take;
474 		off += take;
475 		if (take == sz) {
476 			take = 0;
477 			kmp = kmp->b_cont;
478 		}
479 	}
480 
481 	bcount = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b);
482 	if (bcount == 0) {
483 		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
484 		return (mp);
485 	}
486 
487 	mutex_enter(&sq->mlwq_mtx);
488 	VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2);
489 	cq = sq->mlwq_cq;
490 
491 	/*
492 	 * state is a single int, so read-only access without the CQ lock
493 	 * should be fine.
494 	 */
495 	if (cq->mlcq_state & MLXCX_CQ_TEARDOWN) {
496 		mutex_exit(&sq->mlwq_mtx);
497 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
498 		return (NULL);
499 	}
500 
501 	if ((sq->mlwq_state & (MLXCX_WQ_TEARDOWN | MLXCX_WQ_STARTED)) !=
502 	    MLXCX_WQ_STARTED) {
503 		mutex_exit(&sq->mlwq_mtx);
504 		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
505 		return (NULL);
506 	}
507 
508 	/*
509 	 * If the completion queue buffer count is already at or above
510 	 * the high water mark, or the addition of this new chain will
511 	 * exceed the CQ ring size, then indicate we are blocked.
512 	 */
513 	if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm ||
514 	    (cq->mlcq_bufcnt + bcount) > cq->mlcq_nents) {
515 		atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
516 		goto blocked;
517 	}
518 
519 	if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
520 		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
521 		goto blocked;
522 	}
523 
524 	ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
525 	    chkflags, b);
526 	if (!ok) {
527 		atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
528 		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
529 		goto blocked;
530 	}
531 
532 	/*
533 	 * Now that we've successfully enqueued the rest of the packet,
534 	 * free any mblks that we cut off while inlining headers.
535 	 */
536 	for (; mp != kmp; mp = nmp) {
537 		nmp = mp->b_cont;
538 		freeb(mp);
539 	}
540 
541 	mutex_exit(&sq->mlwq_mtx);
542 
543 	return (NULL);
544 
545 blocked:
546 	mutex_exit(&sq->mlwq_mtx);
547 	mlxcx_buf_return_chain(mlxp, b, B_TRUE);
548 	return (mp);
549 }
550 
551 static int
552 mlxcx_mac_setpromisc(void *arg, boolean_t on)
553 {
554 	mlxcx_t *mlxp = (mlxcx_t *)arg;
555 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
556 	mlxcx_flow_group_t *fg;
557 	mlxcx_flow_entry_t *fe;
558 	mlxcx_flow_table_t *ft;
559 	mlxcx_ring_group_t *g;
560 	int ret = 0;
561 	uint_t idx;
562 
563 	mutex_enter(&port->mlp_mtx);
564 
565 	/*
566 	 * First, do the top-level flow entry on the root flow table for
567 	 * the port. This catches all traffic that doesn't match any MAC
568 	 * MAC filters.
569 	 */
570 	ft = port->mlp_rx_flow;
571 	mutex_enter(&ft->mlft_mtx);
572 	fg = port->mlp_promisc;
573 	fe = list_head(&fg->mlfg_entries);
574 	if (on && !(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
575 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
576 			ret = EIO;
577 		}
578 	} else if (!on && (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
579 		if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
580 			ret = EIO;
581 		}
582 	}
583 	mutex_exit(&ft->mlft_mtx);
584 
585 	/*
586 	 * If we failed to change the top-level entry, don't bother with
587 	 * trying the per-group ones.
588 	 */
589 	if (ret != 0) {
590 		mutex_exit(&port->mlp_mtx);
591 		return (ret);
592 	}
593 
594 	/*
595 	 * Then, do the per-rx-group flow entries which catch traffic that
596 	 * matched a MAC filter but failed to match a VLAN filter.
597 	 */
598 	for (idx = 0; idx < mlxp->mlx_rx_ngroups; ++idx) {
599 		g = &mlxp->mlx_rx_groups[idx];
600 
601 		mutex_enter(&g->mlg_mtx);
602 
603 		ft = g->mlg_rx_vlan_ft;
604 		mutex_enter(&ft->mlft_mtx);
605 
606 		fg = g->mlg_rx_vlan_promisc_fg;
607 		fe = list_head(&fg->mlfg_entries);
608 		if (on && !(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
609 			if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
610 				ret = EIO;
611 			}
612 		} else if (!on && (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED)) {
613 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
614 				ret = EIO;
615 			}
616 		}
617 
618 		mutex_exit(&ft->mlft_mtx);
619 		mutex_exit(&g->mlg_mtx);
620 	}
621 
622 	mutex_exit(&port->mlp_mtx);
623 	return (ret);
624 }
625 
626 static int
627 mlxcx_mac_multicast(void *arg, boolean_t add, const uint8_t *addr)
628 {
629 	mlxcx_t *mlxp = (mlxcx_t *)arg;
630 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
631 	mlxcx_ring_group_t *g = &mlxp->mlx_rx_groups[0];
632 	int ret = 0;
633 
634 	mutex_enter(&port->mlp_mtx);
635 	mutex_enter(&g->mlg_mtx);
636 	if (add) {
637 		if (!mlxcx_add_umcast_entry(mlxp, port, g, addr)) {
638 			ret = EIO;
639 		}
640 	} else {
641 		if (!mlxcx_remove_umcast_entry(mlxp, port, g, addr)) {
642 			ret = EIO;
643 		}
644 	}
645 	mutex_exit(&g->mlg_mtx);
646 	mutex_exit(&port->mlp_mtx);
647 	return (ret);
648 }
649 
650 static int
651 mlxcx_group_add_mac(void *arg, const uint8_t *mac_addr)
652 {
653 	mlxcx_ring_group_t *g = arg;
654 	mlxcx_t *mlxp = g->mlg_mlx;
655 	mlxcx_port_t *port = g->mlg_port;
656 	int ret = 0;
657 
658 	mutex_enter(&port->mlp_mtx);
659 	mutex_enter(&g->mlg_mtx);
660 	if (!mlxcx_add_umcast_entry(mlxp, port, g, mac_addr)) {
661 		ret = EIO;
662 	}
663 	mutex_exit(&g->mlg_mtx);
664 	mutex_exit(&port->mlp_mtx);
665 
666 	return (ret);
667 }
668 
669 static int
670 mlxcx_group_add_vlan(mac_group_driver_t gh, uint16_t vid)
671 {
672 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
673 	mlxcx_t *mlxp = g->mlg_mlx;
674 	int ret = 0;
675 	boolean_t tagged = B_TRUE;
676 
677 	if (vid == MAC_VLAN_UNTAGGED) {
678 		vid = 0;
679 		tagged = B_FALSE;
680 	}
681 
682 	mutex_enter(&g->mlg_mtx);
683 	if (!mlxcx_add_vlan_entry(mlxp, g, tagged, vid)) {
684 		ret = EIO;
685 	}
686 	mutex_exit(&g->mlg_mtx);
687 
688 	return (ret);
689 }
690 
691 static int
692 mlxcx_group_remove_vlan(mac_group_driver_t gh, uint16_t vid)
693 {
694 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
695 	mlxcx_t *mlxp = g->mlg_mlx;
696 	int ret = 0;
697 	boolean_t tagged = B_TRUE;
698 
699 	if (vid == MAC_VLAN_UNTAGGED) {
700 		vid = 0;
701 		tagged = B_FALSE;
702 	}
703 
704 	mutex_enter(&g->mlg_mtx);
705 	if (!mlxcx_remove_vlan_entry(mlxp, g, tagged, vid)) {
706 		ret = EIO;
707 	}
708 	mutex_exit(&g->mlg_mtx);
709 
710 	return (ret);
711 }
712 
713 static int
714 mlxcx_group_remove_mac(void *arg, const uint8_t *mac_addr)
715 {
716 	mlxcx_ring_group_t *g = arg;
717 	mlxcx_t *mlxp = g->mlg_mlx;
718 	mlxcx_port_t *port = g->mlg_port;
719 	int ret = 0;
720 
721 	mutex_enter(&port->mlp_mtx);
722 	mutex_enter(&g->mlg_mtx);
723 	if (!mlxcx_remove_umcast_entry(mlxp, port, g, mac_addr)) {
724 		ret = EIO;
725 	}
726 	mutex_exit(&g->mlg_mtx);
727 	mutex_exit(&port->mlp_mtx);
728 
729 	return (ret);
730 }
731 
732 static int
733 mlxcx_mac_ring_start(mac_ring_driver_t rh, uint64_t gen_num)
734 {
735 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
736 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
737 	mlxcx_ring_group_t *g = wq->mlwq_group;
738 	mlxcx_t *mlxp = wq->mlwq_mlx;
739 
740 	ASSERT(cq != NULL);
741 	ASSERT(g != NULL);
742 
743 	ASSERT(wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ ||
744 	    wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ);
745 	if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
746 	    !mlxcx_tx_ring_start(mlxp, g, wq))
747 		return (EIO);
748 	if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
749 	    !mlxcx_rx_ring_start(mlxp, g, wq))
750 		return (EIO);
751 
752 	mutex_enter(&cq->mlcq_mtx);
753 	cq->mlcq_mac_gen = gen_num;
754 	mutex_exit(&cq->mlcq_mtx);
755 
756 	return (0);
757 }
758 
759 static void
760 mlxcx_mac_ring_stop(mac_ring_driver_t rh)
761 {
762 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)rh;
763 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
764 	mlxcx_t *mlxp = wq->mlwq_mlx;
765 	mlxcx_buf_shard_t *s;
766 	mlxcx_buffer_t *buf;
767 
768 	/*
769 	 * To prevent deadlocks and sleeping whilst holding either the
770 	 * CQ mutex or WQ mutex, we split the stop processing into two
771 	 * parts.
772 	 *
773 	 * With the CQ amd WQ mutexes held the appropriate WQ is stopped.
774 	 * The Q in the HCA is set to Reset state and flagged as no
775 	 * longer started. Atomic with changing this WQ state, the buffer
776 	 * shards are flagged as draining.
777 	 *
778 	 * Now, any requests for buffers and attempts to submit messages
779 	 * will fail and once we're in this state it is safe to relinquish
780 	 * the CQ and WQ mutexes. Allowing us to complete the ring stop
781 	 * by waiting for the buffer lists, with the exception of
782 	 * the loaned list, to drain. Buffers on the loaned list are
783 	 * not under our control, we will get them back when the mblk tied
784 	 * to the buffer is freed.
785 	 */
786 
787 	mutex_enter(&cq->mlcq_mtx);
788 	mutex_enter(&wq->mlwq_mtx);
789 
790 	if (wq->mlwq_state & MLXCX_WQ_STARTED) {
791 		if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
792 		    !mlxcx_cmd_stop_rq(mlxp, wq)) {
793 			mutex_exit(&wq->mlwq_mtx);
794 			mutex_exit(&cq->mlcq_mtx);
795 			return;
796 		}
797 		if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
798 		    !mlxcx_cmd_stop_sq(mlxp, wq)) {
799 			mutex_exit(&wq->mlwq_mtx);
800 			mutex_exit(&cq->mlcq_mtx);
801 			return;
802 		}
803 	}
804 	ASSERT0(wq->mlwq_state & MLXCX_WQ_STARTED);
805 
806 	mlxcx_shard_draining(wq->mlwq_bufs);
807 	if (wq->mlwq_foreign_bufs != NULL)
808 		mlxcx_shard_draining(wq->mlwq_foreign_bufs);
809 
810 
811 	if (wq->mlwq_state & MLXCX_WQ_BUFFERS) {
812 		list_t cq_buffers;
813 
814 		/*
815 		 * Take the buffers away from the CQ. If the CQ is being
816 		 * processed and the WQ has been stopped, a completion
817 		 * which does not match to a buffer will be ignored.
818 		 */
819 		list_create(&cq_buffers, sizeof (mlxcx_buffer_t),
820 		    offsetof(mlxcx_buffer_t, mlb_cq_entry));
821 
822 		list_move_tail(&cq_buffers, &cq->mlcq_buffers);
823 
824 		mutex_enter(&cq->mlcq_bufbmtx);
825 		list_move_tail(&cq_buffers, &cq->mlcq_buffers_b);
826 		mutex_exit(&cq->mlcq_bufbmtx);
827 
828 		cq->mlcq_bufcnt = 0;
829 
830 		mutex_exit(&wq->mlwq_mtx);
831 		mutex_exit(&cq->mlcq_mtx);
832 
833 		/* Return any outstanding buffers to the free pool. */
834 		while ((buf = list_remove_head(&cq_buffers)) != NULL) {
835 			mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
836 		}
837 		list_destroy(&cq_buffers);
838 
839 		s = wq->mlwq_bufs;
840 		mutex_enter(&s->mlbs_mtx);
841 		while (!list_is_empty(&s->mlbs_busy))
842 			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
843 		while ((buf = list_head(&s->mlbs_free)) != NULL) {
844 			mlxcx_buf_destroy(mlxp, buf);
845 		}
846 		mutex_exit(&s->mlbs_mtx);
847 
848 		s = wq->mlwq_foreign_bufs;
849 		if (s != NULL) {
850 			mutex_enter(&s->mlbs_mtx);
851 			while (!list_is_empty(&s->mlbs_busy))
852 				cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
853 			while ((buf = list_head(&s->mlbs_free)) != NULL) {
854 				mlxcx_buf_destroy(mlxp, buf);
855 			}
856 			mutex_exit(&s->mlbs_mtx);
857 		}
858 
859 		mutex_enter(&wq->mlwq_mtx);
860 		wq->mlwq_state &= ~MLXCX_WQ_BUFFERS;
861 		mutex_exit(&wq->mlwq_mtx);
862 	} else {
863 		mutex_exit(&wq->mlwq_mtx);
864 		mutex_exit(&cq->mlcq_mtx);
865 	}
866 }
867 
868 static int
869 mlxcx_mac_group_start(mac_group_driver_t gh)
870 {
871 	mlxcx_ring_group_t *g = (mlxcx_ring_group_t *)gh;
872 	mlxcx_t *mlxp = g->mlg_mlx;
873 
874 	VERIFY3S(g->mlg_type, ==, MLXCX_GROUP_RX);
875 	ASSERT(mlxp != NULL);
876 
877 	if (g->mlg_state & MLXCX_GROUP_RUNNING)
878 		return (0);
879 
880 	if (!mlxcx_rx_group_start(mlxp, g))
881 		return (EIO);
882 
883 	return (0);
884 }
885 
886 static void
887 mlxcx_mac_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
888     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
889 {
890 	mlxcx_t *mlxp = (mlxcx_t *)arg;
891 	mlxcx_ring_group_t *g;
892 	mlxcx_work_queue_t *wq;
893 	mac_intr_t *mintr = &infop->mri_intr;
894 
895 	if (rtype != MAC_RING_TYPE_TX)
896 		return;
897 	ASSERT3S(group_index, ==, -1);
898 
899 	g = &mlxp->mlx_tx_groups[0];
900 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
901 	mutex_enter(&g->mlg_mtx);
902 
903 	ASSERT3S(ring_index, >=, 0);
904 	ASSERT3S(ring_index, <, g->mlg_nwqs);
905 
906 	wq = &g->mlg_wqs[ring_index];
907 
908 	wq->mlwq_cq->mlcq_mac_hdl = rh;
909 
910 	infop->mri_driver = (mac_ring_driver_t)wq;
911 	infop->mri_start = mlxcx_mac_ring_start;
912 	infop->mri_stop = mlxcx_mac_ring_stop;
913 	infop->mri_tx = mlxcx_mac_ring_tx;
914 	infop->mri_stat = mlxcx_mac_ring_stat;
915 
916 	mintr->mi_ddi_handle = mlxp->mlx_intr_handles[
917 	    wq->mlwq_cq->mlcq_eq->mleq_intr_index];
918 
919 	mutex_exit(&g->mlg_mtx);
920 }
921 
922 static int
923 mlxcx_mac_ring_intr_enable(mac_intr_handle_t intrh)
924 {
925 	mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
926 	mlxcx_t *mlxp = cq->mlcq_mlx;
927 
928 	/*
929 	 * We are going to call mlxcx_arm_cq() here, so we take the arm lock
930 	 * as well as the CQ one to make sure we don't race against
931 	 * mlxcx_intr_n().
932 	 */
933 	mutex_enter(&cq->mlcq_arm_mtx);
934 	mutex_enter(&cq->mlcq_mtx);
935 	if (cq->mlcq_state & MLXCX_CQ_POLLING) {
936 		atomic_and_uint(&cq->mlcq_state, ~MLXCX_CQ_POLLING);
937 		if (!(cq->mlcq_state & MLXCX_CQ_ARMED))
938 			mlxcx_arm_cq(mlxp, cq);
939 	}
940 	mutex_exit(&cq->mlcq_mtx);
941 	mutex_exit(&cq->mlcq_arm_mtx);
942 
943 	return (0);
944 }
945 
946 static int
947 mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh)
948 {
949 	mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
950 
951 	mutex_enter(&cq->mlcq_mtx);
952 	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
953 	mutex_exit(&cq->mlcq_mtx);
954 
955 	return (0);
956 }
957 
958 static mblk_t *
959 mlxcx_mac_ring_rx_poll(void *arg, int poll_bytes)
960 {
961 	mlxcx_work_queue_t *wq = (mlxcx_work_queue_t *)arg;
962 	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
963 	mlxcx_t *mlxp = wq->mlwq_mlx;
964 	mblk_t *mp;
965 
966 	ASSERT(cq != NULL);
967 	ASSERT3S(poll_bytes, >, 0);
968 	if (poll_bytes == 0)
969 		return (NULL);
970 
971 	mutex_enter(&cq->mlcq_mtx);
972 	mp = mlxcx_rx_poll(mlxp, cq, poll_bytes);
973 	mutex_exit(&cq->mlcq_mtx);
974 
975 	return (mp);
976 }
977 
978 static void
979 mlxcx_mac_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
980     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
981 {
982 	mlxcx_t *mlxp = (mlxcx_t *)arg;
983 	mlxcx_ring_group_t *g;
984 	mlxcx_work_queue_t *wq;
985 	mac_intr_t *mintr = &infop->mri_intr;
986 
987 	if (rtype != MAC_RING_TYPE_RX)
988 		return;
989 	ASSERT3S(group_index, >=, 0);
990 	ASSERT3S(group_index, <, mlxp->mlx_rx_ngroups);
991 
992 	g = &mlxp->mlx_rx_groups[group_index];
993 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
994 	mutex_enter(&g->mlg_mtx);
995 
996 	ASSERT3S(ring_index, >=, 0);
997 	ASSERT3S(ring_index, <, g->mlg_nwqs);
998 
999 	ASSERT(g->mlg_state & MLXCX_GROUP_WQS);
1000 	wq = &g->mlg_wqs[ring_index];
1001 
1002 	wq->mlwq_cq->mlcq_mac_hdl = rh;
1003 
1004 	infop->mri_driver = (mac_ring_driver_t)wq;
1005 	infop->mri_start = mlxcx_mac_ring_start;
1006 	infop->mri_stop = mlxcx_mac_ring_stop;
1007 	infop->mri_poll = mlxcx_mac_ring_rx_poll;
1008 	infop->mri_stat = mlxcx_mac_ring_stat;
1009 
1010 	mintr->mi_handle = (mac_intr_handle_t)wq->mlwq_cq;
1011 	mintr->mi_enable = mlxcx_mac_ring_intr_enable;
1012 	mintr->mi_disable = mlxcx_mac_ring_intr_disable;
1013 
1014 	mintr->mi_ddi_handle = mlxp->mlx_intr_handles[
1015 	    wq->mlwq_cq->mlcq_eq->mleq_intr_index];
1016 
1017 	mutex_exit(&g->mlg_mtx);
1018 }
1019 
1020 static void
1021 mlxcx_mac_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
1022     mac_group_info_t *infop, mac_group_handle_t gh)
1023 {
1024 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1025 	mlxcx_ring_group_t *g;
1026 
1027 	if (rtype != MAC_RING_TYPE_RX)
1028 		return;
1029 
1030 	ASSERT3S(index, >=, 0);
1031 	ASSERT3S(index, <, mlxp->mlx_rx_ngroups);
1032 	g = &mlxp->mlx_rx_groups[index];
1033 	ASSERT(g->mlg_state & MLXCX_GROUP_INIT);
1034 
1035 	g->mlg_mac_hdl = gh;
1036 
1037 	infop->mgi_driver = (mac_group_driver_t)g;
1038 	infop->mgi_start = mlxcx_mac_group_start;
1039 	infop->mgi_stop = NULL;
1040 	infop->mgi_addmac = mlxcx_group_add_mac;
1041 	infop->mgi_remmac = mlxcx_group_remove_mac;
1042 	infop->mgi_addvlan = mlxcx_group_add_vlan;
1043 	infop->mgi_remvlan = mlxcx_group_remove_vlan;
1044 
1045 	infop->mgi_count = g->mlg_nwqs;
1046 }
1047 
1048 static boolean_t
1049 mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1050 {
1051 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1052 	mac_capab_rings_t *cap_rings;
1053 	mac_capab_led_t *cap_leds;
1054 	mac_capab_transceiver_t *cap_txr;
1055 	uint_t i, n = 0;
1056 
1057 	switch (cap) {
1058 
1059 	case MAC_CAPAB_RINGS:
1060 		cap_rings = cap_data;
1061 		cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
1062 		switch (cap_rings->mr_type) {
1063 		case MAC_RING_TYPE_TX:
1064 			cap_rings->mr_gnum = 0;
1065 			cap_rings->mr_rnum = mlxp->mlx_tx_groups[0].mlg_nwqs;
1066 			cap_rings->mr_rget = mlxcx_mac_fill_tx_ring;
1067 			cap_rings->mr_gget = NULL;
1068 			cap_rings->mr_gaddring = NULL;
1069 			cap_rings->mr_gremring = NULL;
1070 			break;
1071 		case MAC_RING_TYPE_RX:
1072 			cap_rings->mr_gnum = mlxp->mlx_rx_ngroups;
1073 			for (i = 0; i < mlxp->mlx_rx_ngroups; ++i)
1074 				n += mlxp->mlx_rx_groups[i].mlg_nwqs;
1075 			cap_rings->mr_rnum = n;
1076 			cap_rings->mr_rget = mlxcx_mac_fill_rx_ring;
1077 			cap_rings->mr_gget = mlxcx_mac_fill_rx_group;
1078 			cap_rings->mr_gaddring = NULL;
1079 			cap_rings->mr_gremring = NULL;
1080 			break;
1081 		default:
1082 			return (B_FALSE);
1083 		}
1084 		break;
1085 
1086 	case MAC_CAPAB_HCKSUM:
1087 		if (mlxp->mlx_caps->mlc_checksum) {
1088 			*(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 |
1089 			    HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM;
1090 		}
1091 		break;
1092 
1093 	case MAC_CAPAB_LED:
1094 		cap_leds = cap_data;
1095 
1096 		cap_leds->mcl_flags = 0;
1097 		cap_leds->mcl_modes = MAC_LED_DEFAULT | MAC_LED_OFF |
1098 		    MAC_LED_IDENT;
1099 		cap_leds->mcl_set = mlxcx_mac_led_set;
1100 		break;
1101 
1102 	case MAC_CAPAB_TRANSCEIVER:
1103 		cap_txr = cap_data;
1104 
1105 		cap_txr->mct_flags = 0;
1106 		cap_txr->mct_ntransceivers = 1;
1107 		cap_txr->mct_info = mlxcx_mac_txr_info;
1108 		cap_txr->mct_read = mlxcx_mac_txr_read;
1109 		break;
1110 
1111 	default:
1112 		return (B_FALSE);
1113 	}
1114 
1115 	return (B_TRUE);
1116 }
1117 
1118 static void
1119 mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1120     mac_prop_info_handle_t prh)
1121 {
1122 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1123 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1124 
1125 	mutex_enter(&port->mlp_mtx);
1126 
1127 	switch (pr_num) {
1128 	case MAC_PROP_DUPLEX:
1129 	case MAC_PROP_SPEED:
1130 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1131 		break;
1132 	case MAC_PROP_MTU:
1133 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1134 		mac_prop_info_set_range_uint32(prh, MLXCX_MTU_OFFSET,
1135 		    port->mlp_max_mtu);
1136 		mac_prop_info_set_default_uint32(prh,
1137 		    port->mlp_mtu - MLXCX_MTU_OFFSET);
1138 		break;
1139 	case MAC_PROP_AUTONEG:
1140 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1141 		mac_prop_info_set_default_uint8(prh, 1);
1142 		break;
1143 	case MAC_PROP_ADV_FEC_CAP:
1144 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1145 		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
1146 		break;
1147 	case MAC_PROP_EN_FEC_CAP:
1148 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1149 		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
1150 		break;
1151 	case MAC_PROP_ADV_100GFDX_CAP:
1152 	case MAC_PROP_EN_100GFDX_CAP:
1153 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1154 		mac_prop_info_set_default_uint8(prh,
1155 		    (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0);
1156 		break;
1157 	case MAC_PROP_ADV_50GFDX_CAP:
1158 	case MAC_PROP_EN_50GFDX_CAP:
1159 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1160 		mac_prop_info_set_default_uint8(prh,
1161 		    (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0);
1162 		break;
1163 	case MAC_PROP_ADV_40GFDX_CAP:
1164 	case MAC_PROP_EN_40GFDX_CAP:
1165 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1166 		mac_prop_info_set_default_uint8(prh,
1167 		    (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0);
1168 		break;
1169 	case MAC_PROP_ADV_25GFDX_CAP:
1170 	case MAC_PROP_EN_25GFDX_CAP:
1171 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1172 		mac_prop_info_set_default_uint8(prh,
1173 		    (port->mlp_oper_proto & MLXCX_PROTO_25G) != 0);
1174 		break;
1175 	case MAC_PROP_ADV_10GFDX_CAP:
1176 	case MAC_PROP_EN_10GFDX_CAP:
1177 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1178 		mac_prop_info_set_default_uint8(prh,
1179 		    (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0);
1180 		break;
1181 	case MAC_PROP_ADV_1000FDX_CAP:
1182 	case MAC_PROP_EN_1000FDX_CAP:
1183 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1184 		mac_prop_info_set_default_uint8(prh,
1185 		    (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0);
1186 		break;
1187 	case MAC_PROP_ADV_100FDX_CAP:
1188 	case MAC_PROP_EN_100FDX_CAP:
1189 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
1190 		mac_prop_info_set_default_uint8(prh,
1191 		    (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0);
1192 		break;
1193 	default:
1194 		break;
1195 	}
1196 
1197 	mutex_exit(&port->mlp_mtx);
1198 }
1199 
1200 static int
1201 mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1202     uint_t pr_valsize, const void *pr_val)
1203 {
1204 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1205 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1206 	int ret = 0;
1207 	uint32_t new_mtu, new_hw_mtu, old_mtu;
1208 	mlxcx_buf_shard_t *sh;
1209 	boolean_t allocd = B_FALSE;
1210 	boolean_t relink = B_FALSE;
1211 	link_fec_t fec;
1212 	mlxcx_pplm_fec_caps_t cap_fec;
1213 
1214 	mutex_enter(&port->mlp_mtx);
1215 
1216 	switch (pr_num) {
1217 	case MAC_PROP_MTU:
1218 		bcopy(pr_val, &new_mtu, sizeof (new_mtu));
1219 		new_hw_mtu = new_mtu + MLXCX_MTU_OFFSET;
1220 		if (new_hw_mtu == port->mlp_mtu)
1221 			break;
1222 		if (new_hw_mtu > port->mlp_max_mtu) {
1223 			ret = EINVAL;
1224 			break;
1225 		}
1226 		sh = list_head(&mlxp->mlx_buf_shards);
1227 		for (; sh != NULL; sh = list_next(&mlxp->mlx_buf_shards, sh)) {
1228 			mutex_enter(&sh->mlbs_mtx);
1229 			if (!list_is_empty(&sh->mlbs_free) ||
1230 			    !list_is_empty(&sh->mlbs_busy) ||
1231 			    !list_is_empty(&sh->mlbs_loaned)) {
1232 				allocd = B_TRUE;
1233 				mutex_exit(&sh->mlbs_mtx);
1234 				break;
1235 			}
1236 			mutex_exit(&sh->mlbs_mtx);
1237 		}
1238 		if (allocd) {
1239 			ret = EBUSY;
1240 			break;
1241 		}
1242 		old_mtu = port->mlp_mtu;
1243 		ret = mac_maxsdu_update(mlxp->mlx_mac_hdl, new_mtu);
1244 		if (ret != 0)
1245 			break;
1246 		port->mlp_mtu = new_hw_mtu;
1247 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, port,
1248 		    MLXCX_MODIFY_NIC_VPORT_CTX_MTU)) {
1249 			port->mlp_mtu = old_mtu;
1250 			(void) mac_maxsdu_update(mlxp->mlx_mac_hdl, old_mtu);
1251 			ret = EIO;
1252 			break;
1253 		}
1254 		if (!mlxcx_cmd_set_port_mtu(mlxp, port)) {
1255 			port->mlp_mtu = old_mtu;
1256 			(void) mac_maxsdu_update(mlxp->mlx_mac_hdl, old_mtu);
1257 			ret = EIO;
1258 			break;
1259 		}
1260 		break;
1261 
1262 	case MAC_PROP_EN_FEC_CAP:
1263 		bcopy(pr_val, &fec, sizeof (fec));
1264 		if (!mlxcx_link_fec_cap(fec, &cap_fec)) {
1265 			ret = EINVAL;
1266 			break;
1267 		}
1268 
1269 		/*
1270 		 * Don't change the FEC if it is already at the requested
1271 		 * setting AND the port is up.
1272 		 * When the port is down, always set the FEC and attempt
1273 		 * to retrain the link.
1274 		 */
1275 		if (fec == port->mlp_fec_requested &&
1276 		    fec == mlxcx_fec_to_link_fec(port->mlp_fec_active) &&
1277 		    port->mlp_oper_status != MLXCX_PORT_STATUS_DOWN)
1278 			break;
1279 
1280 		/*
1281 		 * The most like cause of this failing is an invalid
1282 		 * or unsupported fec option.
1283 		 */
1284 		if (!mlxcx_cmd_modify_port_fec(mlxp, port, cap_fec)) {
1285 			ret = EINVAL;
1286 			break;
1287 		}
1288 
1289 		port->mlp_fec_requested = fec;
1290 
1291 		/*
1292 		 * For FEC to become effective, the link needs to go back
1293 		 * to training and negotiation state. This happens when
1294 		 * the link transitions from down to up, force a relink.
1295 		 */
1296 		relink = B_TRUE;
1297 		break;
1298 
1299 	default:
1300 		ret = ENOTSUP;
1301 		break;
1302 	}
1303 
1304 	if (relink) {
1305 		if (!mlxcx_cmd_modify_port_status(mlxp, port,
1306 		    MLXCX_PORT_STATUS_DOWN) ||
1307 		    !mlxcx_cmd_modify_port_status(mlxp, port,
1308 		    MLXCX_PORT_STATUS_UP)) {
1309 			ret = EIO;
1310 		}
1311 	}
1312 	mutex_exit(&port->mlp_mtx);
1313 
1314 	return (ret);
1315 }
1316 
1317 static int
1318 mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1319     uint_t pr_valsize, void *pr_val)
1320 {
1321 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1322 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
1323 	uint64_t speed;
1324 	int ret = 0;
1325 
1326 	mutex_enter(&port->mlp_mtx);
1327 
1328 	switch (pr_num) {
1329 	case MAC_PROP_DUPLEX:
1330 		if (pr_valsize < sizeof (link_duplex_t)) {
1331 			ret = EOVERFLOW;
1332 			break;
1333 		}
1334 		/* connectx parts only support full duplex */
1335 		*(link_duplex_t *)pr_val = LINK_DUPLEX_FULL;
1336 		break;
1337 	case MAC_PROP_SPEED:
1338 		if (pr_valsize < sizeof (uint64_t)) {
1339 			ret = EOVERFLOW;
1340 			break;
1341 		}
1342 		speed = mlxcx_speed_to_bits(port->mlp_oper_proto);
1343 		bcopy(&speed, pr_val, sizeof (speed));
1344 		break;
1345 	case MAC_PROP_STATUS:
1346 		if (pr_valsize < sizeof (link_state_t)) {
1347 			ret = EOVERFLOW;
1348 			break;
1349 		}
1350 		switch (port->mlp_oper_status) {
1351 		case MLXCX_PORT_STATUS_UP:
1352 		case MLXCX_PORT_STATUS_UP_ONCE:
1353 			*(link_state_t *)pr_val = LINK_STATE_UP;
1354 			break;
1355 		case MLXCX_PORT_STATUS_DOWN:
1356 			*(link_state_t *)pr_val = LINK_STATE_DOWN;
1357 			break;
1358 		default:
1359 			*(link_state_t *)pr_val = LINK_STATE_UNKNOWN;
1360 		}
1361 		break;
1362 	case MAC_PROP_AUTONEG:
1363 		if (pr_valsize < sizeof (uint8_t)) {
1364 			ret = EOVERFLOW;
1365 			break;
1366 		}
1367 		*(uint8_t *)pr_val = port->mlp_autoneg;
1368 		break;
1369 	case MAC_PROP_ADV_FEC_CAP:
1370 		if (pr_valsize < sizeof (link_fec_t)) {
1371 			ret = EOVERFLOW;
1372 			break;
1373 		}
1374 		*(link_fec_t *)pr_val =
1375 		    mlxcx_fec_to_link_fec(port->mlp_fec_active);
1376 		break;
1377 	case MAC_PROP_EN_FEC_CAP:
1378 		if (pr_valsize < sizeof (link_fec_t)) {
1379 			ret = EOVERFLOW;
1380 			break;
1381 		}
1382 		*(link_fec_t *)pr_val = port->mlp_fec_requested;
1383 		break;
1384 	case MAC_PROP_MTU:
1385 		if (pr_valsize < sizeof (uint32_t)) {
1386 			ret = EOVERFLOW;
1387 			break;
1388 		}
1389 		*(uint32_t *)pr_val = port->mlp_mtu - MLXCX_MTU_OFFSET;
1390 		break;
1391 	case MAC_PROP_ADV_100GFDX_CAP:
1392 	case MAC_PROP_EN_100GFDX_CAP:
1393 		if (pr_valsize < sizeof (uint8_t)) {
1394 			ret = EOVERFLOW;
1395 			break;
1396 		}
1397 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1398 		    MLXCX_PROTO_100G) != 0;
1399 		break;
1400 	case MAC_PROP_ADV_50GFDX_CAP:
1401 	case MAC_PROP_EN_50GFDX_CAP:
1402 		if (pr_valsize < sizeof (uint8_t)) {
1403 			ret = EOVERFLOW;
1404 			break;
1405 		}
1406 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1407 		    MLXCX_PROTO_50G) != 0;
1408 		break;
1409 	case MAC_PROP_ADV_40GFDX_CAP:
1410 	case MAC_PROP_EN_40GFDX_CAP:
1411 		if (pr_valsize < sizeof (uint8_t)) {
1412 			ret = EOVERFLOW;
1413 			break;
1414 		}
1415 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1416 		    MLXCX_PROTO_40G) != 0;
1417 		break;
1418 	case MAC_PROP_ADV_25GFDX_CAP:
1419 	case MAC_PROP_EN_25GFDX_CAP:
1420 		if (pr_valsize < sizeof (uint8_t)) {
1421 			ret = EOVERFLOW;
1422 			break;
1423 		}
1424 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1425 		    MLXCX_PROTO_25G) != 0;
1426 		break;
1427 	case MAC_PROP_ADV_10GFDX_CAP:
1428 	case MAC_PROP_EN_10GFDX_CAP:
1429 		if (pr_valsize < sizeof (uint8_t)) {
1430 			ret = EOVERFLOW;
1431 			break;
1432 		}
1433 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1434 		    MLXCX_PROTO_10G) != 0;
1435 		break;
1436 	case MAC_PROP_ADV_1000FDX_CAP:
1437 	case MAC_PROP_EN_1000FDX_CAP:
1438 		if (pr_valsize < sizeof (uint8_t)) {
1439 			ret = EOVERFLOW;
1440 			break;
1441 		}
1442 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1443 		    MLXCX_PROTO_1G) != 0;
1444 		break;
1445 	case MAC_PROP_ADV_100FDX_CAP:
1446 	case MAC_PROP_EN_100FDX_CAP:
1447 		if (pr_valsize < sizeof (uint8_t)) {
1448 			ret = EOVERFLOW;
1449 			break;
1450 		}
1451 		*(uint8_t *)pr_val = (port->mlp_max_proto &
1452 		    MLXCX_PROTO_100M) != 0;
1453 		break;
1454 	default:
1455 		ret = ENOTSUP;
1456 		break;
1457 	}
1458 
1459 	mutex_exit(&port->mlp_mtx);
1460 
1461 	return (ret);
1462 }
1463 
1464 #define	MLXCX_MAC_CALLBACK_FLAGS \
1465 	(MC_GETCAPAB | MC_GETPROP | MC_PROPINFO | MC_SETPROP)
1466 
1467 static mac_callbacks_t mlxcx_mac_callbacks = {
1468 	.mc_callbacks = MLXCX_MAC_CALLBACK_FLAGS,
1469 	.mc_getstat = mlxcx_mac_stat,
1470 	.mc_start = mlxcx_mac_start,
1471 	.mc_stop = mlxcx_mac_stop,
1472 	.mc_setpromisc = mlxcx_mac_setpromisc,
1473 	.mc_multicst = mlxcx_mac_multicast,
1474 	.mc_ioctl = NULL,
1475 	.mc_getcapab = mlxcx_mac_getcapab,
1476 	.mc_setprop = mlxcx_mac_setprop,
1477 	.mc_getprop = mlxcx_mac_getprop,
1478 	.mc_propinfo = mlxcx_mac_propinfo,
1479 	.mc_tx = NULL,
1480 	.mc_unicst = NULL,
1481 };
1482 
1483 boolean_t
1484 mlxcx_register_mac(mlxcx_t *mlxp)
1485 {
1486 	mac_register_t *mac = mac_alloc(MAC_VERSION);
1487 	mlxcx_port_t *port;
1488 	int ret;
1489 
1490 	if (mac == NULL)
1491 		return (B_FALSE);
1492 
1493 	VERIFY3U(mlxp->mlx_nports, ==, 1);
1494 	port = &mlxp->mlx_ports[0];
1495 
1496 	mutex_enter(&port->mlp_mtx);
1497 
1498 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1499 	mac->m_driver = mlxp;
1500 	mac->m_dip = mlxp->mlx_dip;
1501 	mac->m_src_addr = port->mlp_mac_address;
1502 	mac->m_callbacks = &mlxcx_mac_callbacks;
1503 	mac->m_min_sdu = MLXCX_MTU_OFFSET;
1504 	mac->m_max_sdu = port->mlp_mtu - MLXCX_MTU_OFFSET;
1505 	mac->m_margin = VLAN_TAGSZ;
1506 	mac->m_priv_props = mlxcx_priv_props;
1507 	mac->m_v12n = MAC_VIRT_LEVEL1;
1508 
1509 	ret = mac_register(mac, &mlxp->mlx_mac_hdl);
1510 	if (ret != 0) {
1511 		mlxcx_warn(mlxp, "mac_register() returned %d", ret);
1512 	}
1513 	mac_free(mac);
1514 
1515 	mutex_exit(&port->mlp_mtx);
1516 
1517 	mlxcx_update_link_state(mlxp, port);
1518 
1519 	return (ret == 0);
1520 }
1521