1 /*- 2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 #include "opt_inet.h" 27 #include "opt_inet6.h" 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/devctl.h> 35 #include <sys/eventhandler.h> 36 #include <sys/kernel.h> 37 #include <sys/mbuf.h> 38 #include <sys/module.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 42 #include <net/bpf.h> 43 #include <net/ethernet.h> 44 #include <net/infiniband.h> 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/if_dl.h> 48 #include <net/if_media.h> 49 #include <net/if_lagg.h> 50 #include <net/if_llatbl.h> 51 #include <net/if_types.h> 52 #include <net/netisr.h> 53 #include <net/route.h> 54 #include <netinet/if_ether.h> 55 #include <netinet/in.h> 56 #include <netinet/ip6.h> 57 #include <netinet6/in6_var.h> 58 #include <netinet6/nd6.h> 59 60 #include <security/mac/mac_framework.h> 61 62 /* if_lagg(4) support */ 63 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); 64 65 #ifdef INET 66 static inline void 67 infiniband_ipv4_multicast_map(uint32_t addr, 68 const uint8_t *broadcast, uint8_t *buf) 69 { 70 uint8_t scope; 71 72 addr = ntohl(addr); 73 scope = broadcast[5] & 0xF; 74 75 buf[0] = 0; 76 buf[1] = 0xff; 77 buf[2] = 0xff; 78 buf[3] = 0xff; 79 buf[4] = 0xff; 80 buf[5] = 0x10 | scope; 81 buf[6] = 0x40; 82 buf[7] = 0x1b; 83 buf[8] = broadcast[8]; 84 buf[9] = broadcast[9]; 85 buf[10] = 0; 86 buf[11] = 0; 87 buf[12] = 0; 88 buf[13] = 0; 89 buf[14] = 0; 90 buf[15] = 0; 91 buf[16] = (addr >> 24) & 0xff; 92 buf[17] = (addr >> 16) & 0xff; 93 buf[18] = (addr >> 8) & 0xff; 94 buf[19] = addr & 0xff; 95 } 96 #endif 97 98 #ifdef INET6 99 static inline void 100 infiniband_ipv6_multicast_map(const struct in6_addr *addr, 101 const uint8_t *broadcast, uint8_t *buf) 102 { 103 uint8_t scope; 104 105 scope = broadcast[5] & 0xF; 106 107 buf[0] = 0; 108 buf[1] = 0xff; 109 buf[2] = 0xff; 110 buf[3] = 0xff; 111 buf[4] = 0xff; 112 buf[5] = 0x10 | scope; 113 buf[6] = 0x60; 114 buf[7] = 0x1b; 115 buf[8] = broadcast[8]; 116 buf[9] = broadcast[9]; 117 memcpy(&buf[10], &addr->s6_addr[6], 10); 118 } 119 #endif 120 121 /* 122 * This is for clients that have an infiniband_header in the mbuf. 123 */ 124 void 125 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) 126 { 127 struct infiniband_header *ibh; 128 struct ether_header eh; 129 130 if (mb->m_len < sizeof(*ibh)) 131 return; 132 133 ibh = mtod(mb, struct infiniband_header *); 134 eh.ether_type = ibh->ib_protocol; 135 memset(eh.ether_shost, 0, ETHER_ADDR_LEN); 136 memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); 137 mb->m_data += sizeof(*ibh); 138 mb->m_len -= sizeof(*ibh); 139 mb->m_pkthdr.len -= sizeof(*ibh); 140 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); 141 mb->m_data -= sizeof(*ibh); 142 mb->m_len += sizeof(*ibh); 143 mb->m_pkthdr.len += sizeof(*ibh); 144 } 145 146 static void 147 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) 148 { 149 int csum_flags = 0; 150 151 if (src->m_pkthdr.csum_flags & CSUM_IP) 152 csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); 153 if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 154 csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); 155 if (src->m_pkthdr.csum_flags & CSUM_SCTP) 156 csum_flags |= CSUM_SCTP_VALID; 157 dst->m_pkthdr.csum_flags |= csum_flags; 158 if (csum_flags & CSUM_DATA_VALID) 159 dst->m_pkthdr.csum_data = 0xffff; 160 } 161 162 /* 163 * Handle link-layer encapsulation requests. 164 */ 165 static int 166 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) 167 { 168 struct infiniband_header *ih; 169 struct arphdr *ah; 170 uint16_t etype; 171 const uint8_t *lladdr; 172 173 if (req->rtype != IFENCAP_LL) 174 return (EOPNOTSUPP); 175 176 if (req->bufsize < INFINIBAND_HDR_LEN) 177 return (ENOMEM); 178 179 ih = (struct infiniband_header *)req->buf; 180 lladdr = req->lladdr; 181 req->lladdr_off = 0; 182 183 switch (req->family) { 184 case AF_INET: 185 etype = htons(ETHERTYPE_IP); 186 break; 187 case AF_INET6: 188 etype = htons(ETHERTYPE_IPV6); 189 break; 190 case AF_ARP: 191 ah = (struct arphdr *)req->hdata; 192 ah->ar_hrd = htons(ARPHRD_INFINIBAND); 193 194 switch (ntohs(ah->ar_op)) { 195 case ARPOP_REVREQUEST: 196 case ARPOP_REVREPLY: 197 etype = htons(ETHERTYPE_REVARP); 198 break; 199 case ARPOP_REQUEST: 200 case ARPOP_REPLY: 201 default: 202 etype = htons(ETHERTYPE_ARP); 203 break; 204 } 205 206 if (req->flags & IFENCAP_FLAG_BROADCAST) 207 lladdr = ifp->if_broadcastaddr; 208 break; 209 default: 210 return (EAFNOSUPPORT); 211 } 212 213 ih->ib_protocol = etype; 214 ih->ib_reserved = 0; 215 memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); 216 req->bufsize = sizeof(struct infiniband_header); 217 218 return (0); 219 } 220 221 static int 222 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, 223 const struct sockaddr *dst, struct route *ro, uint8_t *phdr, 224 uint32_t *pflags, struct llentry **plle) 225 { 226 struct infiniband_header *ih; 227 uint32_t lleflags = 0; 228 int error = 0; 229 230 if (plle) 231 *plle = NULL; 232 ih = (struct infiniband_header *)phdr; 233 234 switch (dst->sa_family) { 235 #ifdef INET 236 case AF_INET: 237 if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { 238 error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); 239 } else { 240 if (m->m_flags & M_BCAST) { 241 memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, 242 INFINIBAND_ADDR_LEN); 243 } else { 244 infiniband_ipv4_multicast_map( 245 ((const struct sockaddr_in *)dst)->sin_addr.s_addr, 246 ifp->if_broadcastaddr, ih->ib_hwaddr); 247 } 248 ih->ib_protocol = htons(ETHERTYPE_IP); 249 ih->ib_reserved = 0; 250 } 251 break; 252 #endif 253 #ifdef INET6 254 case AF_INET6: 255 if ((m->m_flags & M_MCAST) == 0) { 256 error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); 257 } else { 258 infiniband_ipv6_multicast_map( 259 &((const struct sockaddr_in6 *)dst)->sin6_addr, 260 ifp->if_broadcastaddr, ih->ib_hwaddr); 261 ih->ib_protocol = htons(ETHERTYPE_IPV6); 262 ih->ib_reserved = 0; 263 } 264 break; 265 #endif 266 default: 267 if_printf(ifp, "can't handle af%d\n", dst->sa_family); 268 if (m != NULL) 269 m_freem(m); 270 return (EAFNOSUPPORT); 271 } 272 273 if (error == EHOSTDOWN) { 274 if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) 275 error = EHOSTUNREACH; 276 } 277 278 if (error != 0) 279 return (error); 280 281 *pflags = RT_MAY_LOOP; 282 if (lleflags & LLE_IFADDR) 283 *pflags |= RT_L2_ME; 284 285 return (0); 286 } 287 288 /* 289 * Infiniband output routine. 290 */ 291 static int 292 infiniband_output(struct ifnet *ifp, struct mbuf *m, 293 const struct sockaddr *dst, struct route *ro) 294 { 295 uint8_t linkhdr[INFINIBAND_HDR_LEN]; 296 uint8_t *phdr; 297 struct llentry *lle = NULL; 298 struct infiniband_header *ih; 299 int error = 0; 300 int hlen; /* link layer header length */ 301 uint32_t pflags; 302 bool addref; 303 304 NET_EPOCH_ASSERT(); 305 306 addref = false; 307 phdr = NULL; 308 pflags = 0; 309 if (ro != NULL) { 310 /* XXX BPF uses ro_prepend */ 311 if (ro->ro_prepend != NULL) { 312 phdr = ro->ro_prepend; 313 hlen = ro->ro_plen; 314 } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { 315 if ((ro->ro_flags & RT_LLE_CACHE) != 0) { 316 lle = ro->ro_lle; 317 if (lle != NULL && 318 (lle->la_flags & LLE_VALID) == 0) { 319 LLE_FREE(lle); 320 lle = NULL; /* redundant */ 321 ro->ro_lle = NULL; 322 } 323 if (lle == NULL) { 324 /* if we lookup, keep cache */ 325 addref = 1; 326 } else 327 /* 328 * Notify LLE code that 329 * the entry was used 330 * by datapath. 331 */ 332 llentry_mark_used(lle); 333 } 334 if (lle != NULL) { 335 phdr = lle->r_linkdata; 336 hlen = lle->r_hdrlen; 337 pflags = lle->r_flags; 338 } 339 } 340 } 341 342 #ifdef MAC 343 error = mac_ifnet_check_transmit(ifp, m); 344 if (error) 345 goto bad; 346 #endif 347 348 M_PROFILE(m); 349 if (ifp->if_flags & IFF_MONITOR) { 350 error = ENETDOWN; 351 goto bad; 352 } 353 if (!((ifp->if_flags & IFF_UP) && 354 (ifp->if_drv_flags & IFF_DRV_RUNNING))) { 355 error = ENETDOWN; 356 goto bad; 357 } 358 359 if (phdr == NULL) { 360 /* No prepend data supplied. Try to calculate ourselves. */ 361 phdr = linkhdr; 362 hlen = INFINIBAND_HDR_LEN; 363 error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, 364 addref ? &lle : NULL); 365 if (addref && lle != NULL) 366 ro->ro_lle = lle; 367 if (error != 0) 368 return (error == EWOULDBLOCK ? 0 : error); 369 } 370 371 if ((pflags & RT_L2_ME) != 0) { 372 update_mbuf_csumflags(m, m); 373 return (if_simloop(ifp, m, dst->sa_family, 0)); 374 } 375 376 /* 377 * Add local infiniband header. If no space in first mbuf, 378 * allocate another. 379 */ 380 M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); 381 if (m == NULL) { 382 error = ENOBUFS; 383 goto bad; 384 } 385 if ((pflags & RT_HAS_HEADER) == 0) { 386 ih = mtod(m, struct infiniband_header *); 387 memcpy(ih, phdr, hlen); 388 } 389 390 /* 391 * Queue message on interface, update output statistics if 392 * successful, and start output if interface not yet active. 393 */ 394 return (ifp->if_transmit(ifp, m)); 395 bad: 396 if (m != NULL) 397 m_freem(m); 398 return (error); 399 } 400 401 /* 402 * Process a received Infiniband packet. 403 */ 404 static void 405 infiniband_input(struct ifnet *ifp, struct mbuf *m) 406 { 407 struct infiniband_header *ibh; 408 struct epoch_tracker et; 409 int isr; 410 411 CURVNET_SET_QUIET(ifp->if_vnet); 412 413 if ((ifp->if_flags & IFF_UP) == 0) { 414 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 415 m_freem(m); 416 goto done; 417 } 418 419 ibh = mtod(m, struct infiniband_header *); 420 421 /* 422 * Reset layer specific mbuf flags to avoid confusing upper 423 * layers: 424 */ 425 m->m_flags &= ~M_VLANTAG; 426 m_clrprotoflags(m); 427 428 if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { 429 if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, 430 ifp->if_addrlen) == 0) 431 m->m_flags |= M_BCAST; 432 else 433 m->m_flags |= M_MCAST; 434 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); 435 } 436 437 /* Let BPF have it before we strip the header. */ 438 INFINIBAND_BPF_MTAP(ifp, m); 439 440 /* Allow monitor mode to claim this frame, after stats are updated. */ 441 if (ifp->if_flags & IFF_MONITOR) { 442 m_freem(m); 443 goto done; 444 } 445 446 /* Direct packet to correct FIB based on interface config. */ 447 M_SETFIB(m, ifp->if_fib); 448 449 /* Handle input from a lagg<N> port */ 450 if (ifp->if_type == IFT_INFINIBANDLAG) { 451 KASSERT(lagg_input_infiniband_p != NULL, 452 ("%s: if_lagg not loaded!", __func__)); 453 m = (*lagg_input_infiniband_p)(ifp, m); 454 if (__predict_false(m == NULL)) 455 goto done; 456 ifp = m->m_pkthdr.rcvif; 457 } 458 459 /* 460 * Dispatch frame to upper layer. 461 */ 462 switch (ibh->ib_protocol) { 463 #ifdef INET 464 case htons(ETHERTYPE_IP): 465 isr = NETISR_IP; 466 break; 467 468 case htons(ETHERTYPE_ARP): 469 if (ifp->if_flags & IFF_NOARP) { 470 /* Discard packet if ARP is disabled on interface */ 471 m_freem(m); 472 goto done; 473 } 474 isr = NETISR_ARP; 475 break; 476 #endif 477 #ifdef INET6 478 case htons(ETHERTYPE_IPV6): 479 isr = NETISR_IPV6; 480 break; 481 #endif 482 default: 483 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 484 m_freem(m); 485 goto done; 486 } 487 488 /* Strip off the Infiniband header. */ 489 m_adj(m, INFINIBAND_HDR_LEN); 490 491 #ifdef MAC 492 /* 493 * Tag the mbuf with an appropriate MAC label before any other 494 * consumers can get to it. 495 */ 496 mac_ifnet_create_mbuf(ifp, m); 497 #endif 498 /* Allow monitor mode to claim this frame, after stats are updated. */ 499 NET_EPOCH_ENTER(et); 500 netisr_dispatch(isr, m); 501 NET_EPOCH_EXIT(et); 502 done: 503 CURVNET_RESTORE(); 504 } 505 506 static int 507 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, 508 struct sockaddr *sa) 509 { 510 struct sockaddr_dl *sdl; 511 #ifdef INET 512 struct sockaddr_in *sin; 513 #endif 514 #ifdef INET6 515 struct sockaddr_in6 *sin6; 516 #endif 517 uint8_t *e_addr; 518 519 switch (sa->sa_family) { 520 case AF_LINK: 521 /* 522 * No mapping needed. Just check that it's a valid MC address. 523 */ 524 sdl = (struct sockaddr_dl *)sa; 525 e_addr = LLADDR(sdl); 526 if (!INFINIBAND_IS_MULTICAST(e_addr)) 527 return (EADDRNOTAVAIL); 528 *llsa = NULL; 529 return 0; 530 531 #ifdef INET 532 case AF_INET: 533 sin = (struct sockaddr_in *)sa; 534 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 535 return (EADDRNOTAVAIL); 536 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 537 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 538 e_addr = LLADDR(sdl); 539 infiniband_ipv4_multicast_map( 540 sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); 541 *llsa = (struct sockaddr *)sdl; 542 return (0); 543 #endif 544 #ifdef INET6 545 case AF_INET6: 546 sin6 = (struct sockaddr_in6 *)sa; 547 /* 548 * An IP6 address of 0 means listen to all of the 549 * multicast address used for IP6. This has no meaning 550 * in infiniband. 551 */ 552 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 553 return (EADDRNOTAVAIL); 554 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) 555 return (EADDRNOTAVAIL); 556 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 557 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 558 e_addr = LLADDR(sdl); 559 infiniband_ipv6_multicast_map( 560 &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); 561 *llsa = (struct sockaddr *)sdl; 562 return (0); 563 #endif 564 default: 565 return (EAFNOSUPPORT); 566 } 567 } 568 569 void 570 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) 571 { 572 struct sockaddr_dl *sdl; 573 struct ifaddr *ifa; 574 int i; 575 576 ifp->if_addrlen = INFINIBAND_ADDR_LEN; 577 ifp->if_hdrlen = INFINIBAND_HDR_LEN; 578 ifp->if_mtu = INFINIBAND_MTU; 579 if_attach(ifp); 580 ifp->if_output = infiniband_output; 581 ifp->if_input = infiniband_input; 582 ifp->if_resolvemulti = infiniband_resolvemulti; 583 ifp->if_requestencap = infiniband_requestencap; 584 585 if (ifp->if_baudrate == 0) 586 ifp->if_baudrate = IF_Gbps(10); /* default value */ 587 if (llb != NULL) 588 ifp->if_broadcastaddr = llb; 589 590 ifa = ifp->if_addr; 591 KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); 592 sdl = (struct sockaddr_dl *)ifa->ifa_addr; 593 sdl->sdl_type = IFT_INFINIBAND; 594 sdl->sdl_alen = ifp->if_addrlen; 595 596 if (lla != NULL) { 597 memcpy(LLADDR(sdl), lla, ifp->if_addrlen); 598 599 if (ifp->if_hw_addr != NULL) 600 memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); 601 } else { 602 lla = LLADDR(sdl); 603 } 604 605 /* Attach ethernet compatible network device */ 606 bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); 607 608 /* Announce Infiniband MAC address if non-zero. */ 609 for (i = 0; i < ifp->if_addrlen; i++) 610 if (lla[i] != 0) 611 break; 612 if (i != ifp->if_addrlen) 613 if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); 614 615 /* Add necessary bits are setup; announce it now. */ 616 EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); 617 618 if (IS_DEFAULT_VNET(curvnet)) 619 devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); 620 } 621 622 /* 623 * Perform common duties while detaching an Infiniband interface 624 */ 625 void 626 infiniband_ifdetach(struct ifnet *ifp) 627 { 628 bpfdetach(ifp); 629 if_detach(ifp); 630 } 631 632 static int 633 infiniband_modevent(module_t mod, int type, void *data) 634 { 635 switch (type) { 636 case MOD_LOAD: 637 case MOD_UNLOAD: 638 return (0); 639 default: 640 return (EOPNOTSUPP); 641 } 642 } 643 644 static moduledata_t infiniband_mod = { 645 .name = "if_infiniband", 646 .evhand = &infiniband_modevent, 647 }; 648 649 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); 650 MODULE_VERSION(if_infiniband, 1); 651