xref: /freebsd/sys/net/if_infiniband.c (revision b19cdab3456b361e1ef79651fe1437d8cab4de19)
1 /*-
2  * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include "opt_inet.h"
27 #include "opt_inet6.h"
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/devctl.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/mbuf.h>
38 #include <sys/module.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 
42 #include <net/bpf.h>
43 #include <net/ethernet.h>
44 #include <net/infiniband.h>
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/if_media.h>
49 #include <net/if_lagg.h>
50 #include <net/if_llatbl.h>
51 #include <net/if_types.h>
52 #include <net/netisr.h>
53 #include <net/route.h>
54 #include <netinet/if_ether.h>
55 #include <netinet/in.h>
56 #include <netinet/ip6.h>
57 #include <netinet6/in6_var.h>
58 #include <netinet6/nd6.h>
59 
60 #include <security/mac/mac_framework.h>
61 
/*
 * if_lagg(4) support.
 *
 * Hook called by infiniband_input() for frames received on an interface
 * of type IFT_INFINIBANDLAG.  It is given the receiving interface and
 * the mbuf, and returns the mbuf to continue processing with, or NULL
 * when there is nothing left to process.
 * NOTE(review): presumably set by if_lagg when it is loaded -- confirm.
 */
struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);
64 
65 #ifdef INET
/*
 * Build the 20-byte link-layer multicast address for an IPv4 group.
 *
 *	addr		IPv4 group address in network byte order.
 *	broadcast	Link-layer broadcast address of the interface; the
 *			low nibble of byte 5 and bytes 8-9 are carried over
 *			into the result (scope and, per RFC 4391, the
 *			partition key -- confirm).
 *	buf		Output buffer; bytes 0 through 19 are written.
 */
static inline void
infiniband_ipv4_multicast_map(uint32_t addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	const uint32_t group = ntohl(addr);
	const uint8_t mapped[20] = {
		/* Fixed prefix, with the scope nibble folded into byte 5. */
		0x00, 0xff, 0xff, 0xff,
		0xff, (uint8_t)(0x10 | (broadcast[5] & 0xF)), 0x40, 0x1b,
		/* Two bytes taken verbatim from the broadcast address. */
		broadcast[8], broadcast[9],
		/* Zero padding in front of the group address. */
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
		/* Group address, most significant byte first. */
		(uint8_t)(group >> 24), (uint8_t)(group >> 16),
		(uint8_t)(group >> 8), (uint8_t)group,
	};

	memcpy(buf, mapped, sizeof(mapped));
}
96 #endif
97 
98 #ifdef INET6
/*
 * Build the 20-byte link-layer multicast address for an IPv6 group.
 *
 *	addr		IPv6 group address; its last 10 bytes (s6_addr[6]
 *			through s6_addr[15]) end up in the result.
 *	broadcast	Link-layer broadcast address of the interface; the
 *			low nibble of byte 5 and bytes 8-9 are carried over
 *			into the result.
 *	buf		Output buffer; bytes 0 through 19 are written.
 */
static inline void
infiniband_ipv6_multicast_map(const struct in6_addr *addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	const uint8_t head[10] = {
		/* Fixed prefix, with the scope nibble folded into byte 5. */
		0x00, 0xff, 0xff, 0xff,
		0xff, (uint8_t)(0x10 | (broadcast[5] & 0xF)), 0x60, 0x1b,
		/* Two bytes taken verbatim from the broadcast address. */
		broadcast[8], broadcast[9],
	};

	memcpy(buf, head, sizeof(head));
	memcpy(buf + sizeof(head), addr->s6_addr + 6, 10);
}
119 #endif
120 
121 /*
122  * This is for clients that have an infiniband_header in the mbuf.
123  */
124 void
125 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
126 {
127 	struct infiniband_header *ibh;
128 	struct ether_header eh;
129 
130 	if (mb->m_len < sizeof(*ibh))
131 		return;
132 
133 	ibh = mtod(mb, struct infiniband_header *);
134 	eh.ether_type = ibh->ib_protocol;
135 	memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
136 	memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
137 	mb->m_data += sizeof(*ibh);
138 	mb->m_len -= sizeof(*ibh);
139 	mb->m_pkthdr.len -= sizeof(*ibh);
140 	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
141 	mb->m_data -= sizeof(*ibh);
142 	mb->m_len += sizeof(*ibh);
143 	mb->m_pkthdr.len += sizeof(*ibh);
144 }
145 
146 /*
147  * Infiniband output routine.
148  */
149 static int
150 infiniband_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
151     struct route *ro)
152 {
153 	uint8_t edst[INFINIBAND_ADDR_LEN];
154 #if defined(INET) || defined(INET6)
155 	struct llentry *lle = NULL;
156 #endif
157 	struct infiniband_header *ibh;
158 	int error = 0;
159 	uint16_t type;
160 	bool is_gw;
161 
162 	NET_EPOCH_ASSERT();
163 
164 	is_gw = ((ro != NULL) && (ro->ro_flags & RT_HAS_GW) != 0);
165 
166 #ifdef MAC
167 	error = mac_ifnet_check_transmit(ifp, m);
168 	if (error)
169 		goto bad;
170 #endif
171 
172 	M_PROFILE(m);
173 	if (ifp->if_flags & IFF_MONITOR) {
174 		error = ENETDOWN;
175 		goto bad;
176 	}
177 	if (!((ifp->if_flags & IFF_UP) &&
178 	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
179 		error = ENETDOWN;
180 		goto bad;
181 	}
182 
183 	switch (dst->sa_family) {
184 	case AF_LINK:
185 		goto output;
186 #ifdef INET
187 	case AF_INET:
188 		if (lle != NULL && (lle->la_flags & LLE_VALID)) {
189 			memcpy(edst, lle->ll_addr, sizeof(edst));
190 		} else if (m->m_flags & M_MCAST) {
191 			infiniband_ipv4_multicast_map(
192 			    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
193 			    ifp->if_broadcastaddr, edst);
194 		} else {
195 			error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL);
196 			if (error) {
197 				if (error == EWOULDBLOCK)
198 					error = 0;
199 				m = NULL; /* mbuf is consumed by resolver */
200 				goto bad;
201 			}
202 		}
203 		type = htons(ETHERTYPE_IP);
204 		break;
205 	case AF_ARP: {
206 		struct arphdr *ah;
207 
208 		if (m->m_len < sizeof(*ah)) {
209 			error = EINVAL;
210 			goto bad;
211 		}
212 
213 		ah = mtod(m, struct arphdr *);
214 
215 		if (m->m_len < arphdr_len(ah)) {
216 			error = EINVAL;
217 			goto bad;
218 		}
219 		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
220 
221 		switch (ntohs(ah->ar_op)) {
222 		case ARPOP_REVREQUEST:
223 		case ARPOP_REVREPLY:
224 			type = htons(ETHERTYPE_REVARP);
225 			break;
226 		case ARPOP_REQUEST:
227 		case ARPOP_REPLY:
228 		default:
229 			type = htons(ETHERTYPE_ARP);
230 			break;
231 		}
232 
233 		if (m->m_flags & M_BCAST) {
234 			memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN);
235 		} else {
236 			if (ah->ar_hln != INFINIBAND_ADDR_LEN) {
237 				error = EINVAL;
238 				goto bad;
239 			}
240 			memcpy(edst, ar_tha(ah), INFINIBAND_ADDR_LEN);
241 		}
242 		break;
243 	}
244 #endif
245 #ifdef INET6
246 	case AF_INET6: {
247 		const struct ip6_hdr *ip6;
248 
249 		ip6 = mtod(m, const struct ip6_hdr *);
250 		if (m->m_len < sizeof(*ip6)) {
251 			error = EINVAL;
252 			goto bad;
253 		} else if (lle != NULL && (lle->la_flags & LLE_VALID)) {
254 			memcpy(edst, lle->ll_addr, sizeof(edst));
255 		} else if (m->m_flags & M_MCAST) {
256 			infiniband_ipv6_multicast_map(
257 			    &((const struct sockaddr_in6 *)dst)->sin6_addr,
258 			    ifp->if_broadcastaddr, edst);
259 		} else if (ip6->ip6_nxt == IPPROTO_ICMPV6) {
260 			memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN);
261 		} else {
262 			error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL);
263 			if (error) {
264 				if (error == EWOULDBLOCK)
265 					error = 0;
266 				m = NULL; /* mbuf is consumed by resolver */
267 				goto bad;
268 			}
269 		}
270 		type = htons(ETHERTYPE_IPV6);
271 		break;
272 	}
273 #endif
274 	default:
275 		error = EAFNOSUPPORT;
276 		goto bad;
277 	}
278 
279 	/*
280 	 * Add local net header.  If no space in first mbuf,
281 	 * allocate another.
282 	 */
283 	M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
284 	if (m == NULL) {
285 		error = ENOBUFS;
286 		goto bad;
287 	}
288 	ibh = mtod(m, struct infiniband_header *);
289 
290 	ibh->ib_protocol = type;
291 	memcpy(ibh->ib_hwaddr, edst, sizeof(edst));
292 
293 	/*
294 	 * Queue message on interface, update output statistics if
295 	 * successful, and start output if interface not yet active.
296 	 */
297 output:
298 	return (ifp->if_transmit(ifp, m));
299 bad:
300 	if (m != NULL)
301 		m_freem(m);
302 	return (error);
303 }
304 
305 /*
306  * Process a received Infiniband packet.
307  */
308 static void
309 infiniband_input(struct ifnet *ifp, struct mbuf *m)
310 {
311 	struct infiniband_header *ibh;
312 	struct epoch_tracker et;
313 	int isr;
314 
315 	CURVNET_SET_QUIET(ifp->if_vnet);
316 
317 	if ((ifp->if_flags & IFF_UP) == 0) {
318 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
319 		m_freem(m);
320 		goto done;
321 	}
322 
323 	ibh = mtod(m, struct infiniband_header *);
324 
325 	/*
326 	 * Reset layer specific mbuf flags to avoid confusing upper
327 	 * layers:
328 	 */
329 	m->m_flags &= ~M_VLANTAG;
330 	m_clrprotoflags(m);
331 
332 	if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
333 		if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
334 		    ifp->if_addrlen) == 0)
335 			m->m_flags |= M_BCAST;
336 		else
337 			m->m_flags |= M_MCAST;
338 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
339 	}
340 
341 	/* Let BPF have it before we strip the header. */
342 	INFINIBAND_BPF_MTAP(ifp, m);
343 
344 	/* Allow monitor mode to claim this frame, after stats are updated. */
345 	if (ifp->if_flags & IFF_MONITOR) {
346 		m_freem(m);
347 		goto done;
348 	}
349 
350 	/* Direct packet to correct FIB based on interface config. */
351 	M_SETFIB(m, ifp->if_fib);
352 
353 	/* Handle input from a lagg<N> port */
354 	if (ifp->if_type == IFT_INFINIBANDLAG) {
355 		KASSERT(lagg_input_infiniband_p != NULL,
356 		    ("%s: if_lagg not loaded!", __func__));
357 		m = (*lagg_input_infiniband_p)(ifp, m);
358 		if (__predict_false(m == NULL))
359 			goto done;
360 		ifp = m->m_pkthdr.rcvif;
361 	}
362 
363 	/*
364 	 * Dispatch frame to upper layer.
365 	 */
366 	switch (ibh->ib_protocol) {
367 #ifdef INET
368 	case htons(ETHERTYPE_IP):
369 		isr = NETISR_IP;
370 		break;
371 
372 	case htons(ETHERTYPE_ARP):
373 		if (ifp->if_flags & IFF_NOARP) {
374 			/* Discard packet if ARP is disabled on interface */
375 			m_freem(m);
376 			goto done;
377 		}
378 		isr = NETISR_ARP;
379 		break;
380 #endif
381 #ifdef INET6
382 	case htons(ETHERTYPE_IPV6):
383 		isr = NETISR_IPV6;
384 		break;
385 #endif
386 	default:
387 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
388 		m_freem(m);
389 		goto done;
390 	}
391 
392 	/* Strip off the Infiniband header. */
393 	m_adj(m, INFINIBAND_HDR_LEN);
394 
395 #ifdef MAC
396 	/*
397 	 * Tag the mbuf with an appropriate MAC label before any other
398 	 * consumers can get to it.
399 	 */
400 	mac_ifnet_create_mbuf(ifp, m);
401 #endif
402 	/* Allow monitor mode to claim this frame, after stats are updated. */
403 	NET_EPOCH_ENTER(et);
404 	netisr_dispatch(isr, m);
405 	NET_EPOCH_EXIT(et);
406 done:
407 	CURVNET_RESTORE();
408 }
409 
410 static int
411 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
412     struct sockaddr *sa)
413 {
414 	struct sockaddr_dl *sdl;
415 #ifdef INET
416 	struct sockaddr_in *sin;
417 #endif
418 #ifdef INET6
419 	struct sockaddr_in6 *sin6;
420 #endif
421 	uint8_t *e_addr;
422 
423 	switch (sa->sa_family) {
424 	case AF_LINK:
425 		/*
426 		 * No mapping needed. Just check that it's a valid MC address.
427 		 */
428 		sdl = (struct sockaddr_dl *)sa;
429 		e_addr = LLADDR(sdl);
430 		if (!INFINIBAND_IS_MULTICAST(e_addr))
431 			return (EADDRNOTAVAIL);
432 		*llsa = NULL;
433 		return 0;
434 
435 #ifdef INET
436 	case AF_INET:
437 		sin = (struct sockaddr_in *)sa;
438 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
439 			return (EADDRNOTAVAIL);
440 		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
441 		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
442 		e_addr = LLADDR(sdl);
443 		infiniband_ipv4_multicast_map(
444 		    sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr);
445 		*llsa = (struct sockaddr *)sdl;
446 		return (0);
447 #endif
448 #ifdef INET6
449 	case AF_INET6:
450 		sin6 = (struct sockaddr_in6 *)sa;
451 		/*
452 		 * An IP6 address of 0 means listen to all of the
453 		 * multicast address used for IP6. This has no meaning
454 		 * in infiniband.
455 		 */
456 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
457 			return (EADDRNOTAVAIL);
458 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
459 			return (EADDRNOTAVAIL);
460 		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
461 		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
462 		e_addr = LLADDR(sdl);
463 		infiniband_ipv6_multicast_map(
464 		    &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
465 		*llsa = (struct sockaddr *)sdl;
466 		return (0);
467 #endif
468 	default:
469 		return (EAFNOSUPPORT);
470 	}
471 }
472 
473 void
474 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
475 {
476 	struct sockaddr_dl *sdl;
477 	struct ifaddr *ifa;
478 	int i;
479 
480 	ifp->if_addrlen = INFINIBAND_ADDR_LEN;
481 	ifp->if_hdrlen = INFINIBAND_HDR_LEN;
482 	ifp->if_mtu = INFINIBAND_MTU;
483 	if_attach(ifp);
484 	ifp->if_output = infiniband_output;
485 	ifp->if_input = infiniband_input;
486 	ifp->if_resolvemulti = infiniband_resolvemulti;
487 
488 	if (ifp->if_baudrate == 0)
489 		ifp->if_baudrate = IF_Gbps(10); /* default value */
490 	if (llb != NULL)
491 		ifp->if_broadcastaddr = llb;
492 
493 	ifa = ifp->if_addr;
494 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
495 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
496 	sdl->sdl_type = IFT_INFINIBAND;
497 	sdl->sdl_alen = ifp->if_addrlen;
498 
499 	if (lla != NULL) {
500 		memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
501 
502 		if (ifp->if_hw_addr != NULL)
503 			memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
504 	} else {
505 		lla = LLADDR(sdl);
506 	}
507 
508 	/* Attach ethernet compatible network device */
509 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
510 
511 	/* Announce Infiniband MAC address if non-zero. */
512 	for (i = 0; i < ifp->if_addrlen; i++)
513 		if (lla[i] != 0)
514 			break;
515 	if (i != ifp->if_addrlen)
516 		if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
517 
518 	/* Add necessary bits are setup; announce it now. */
519 	EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
520 
521 	if (IS_DEFAULT_VNET(curvnet))
522 		devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
523 }
524 
/*
 * Perform common duties while detaching an Infiniband interface:
 * detach the BPF tap first, then detach the interface itself.
 */
void
infiniband_ifdetach(struct ifnet *ifp)
{
	bpfdetach(ifp);
	if_detach(ifp);
}
534 
535 static int
536 infiniband_modevent(module_t mod, int type, void *data)
537 {
538 	switch (type) {
539 	case MOD_LOAD:
540 	case MOD_UNLOAD:
541 		return (0);
542 	default:
543 		return (EOPNOTSUPP);
544 	}
545 }
546 
/* Module descriptor: the module name and its event handler. */
static moduledata_t infiniband_mod = {
	.name = "if_infiniband",
	.evhand = &infiniband_modevent,
};

/* Register the module and declare its version as 1. */
DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(if_infiniband, 1);
554