/*-
 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
24 */ 25 26 #include "opt_inet.h" 27 #include "opt_inet6.h" 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/devctl.h> 35 #include <sys/eventhandler.h> 36 #include <sys/kernel.h> 37 #include <sys/mbuf.h> 38 #include <sys/module.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 42 #include <net/bpf.h> 43 #include <net/ethernet.h> 44 #include <net/infiniband.h> 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/if_dl.h> 48 #include <net/if_media.h> 49 #include <net/if_lagg.h> 50 #include <net/if_llatbl.h> 51 #include <net/if_types.h> 52 #include <net/netisr.h> 53 #include <net/route.h> 54 #include <netinet/if_ether.h> 55 #include <netinet/in.h> 56 #include <netinet/ip6.h> 57 #include <netinet6/in6_var.h> 58 #include <netinet6/nd6.h> 59 60 #include <security/mac/mac_framework.h> 61 62 /* if_lagg(4) support */ 63 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); 64 65 #ifdef INET 66 static inline void 67 infiniband_ipv4_multicast_map(uint32_t addr, 68 const uint8_t *broadcast, uint8_t *buf) 69 { 70 uint8_t scope; 71 72 addr = ntohl(addr); 73 scope = broadcast[5] & 0xF; 74 75 buf[0] = 0; 76 buf[1] = 0xff; 77 buf[2] = 0xff; 78 buf[3] = 0xff; 79 buf[4] = 0xff; 80 buf[5] = 0x10 | scope; 81 buf[6] = 0x40; 82 buf[7] = 0x1b; 83 buf[8] = broadcast[8]; 84 buf[9] = broadcast[9]; 85 buf[10] = 0; 86 buf[11] = 0; 87 buf[12] = 0; 88 buf[13] = 0; 89 buf[14] = 0; 90 buf[15] = 0; 91 buf[16] = (addr >> 24) & 0xff; 92 buf[17] = (addr >> 16) & 0xff; 93 buf[18] = (addr >> 8) & 0xff; 94 buf[19] = addr & 0xff; 95 } 96 #endif 97 98 #ifdef INET6 99 static inline void 100 infiniband_ipv6_multicast_map(const struct in6_addr *addr, 101 const uint8_t *broadcast, uint8_t *buf) 102 { 103 uint8_t scope; 104 105 scope = broadcast[5] & 0xF; 106 107 buf[0] = 0; 108 buf[1] = 0xff; 109 buf[2] = 0xff; 110 buf[3] = 0xff; 111 buf[4] = 0xff; 112 buf[5] = 0x10 | scope; 113 buf[6] = 
0x60; 114 buf[7] = 0x1b; 115 buf[8] = broadcast[8]; 116 buf[9] = broadcast[9]; 117 memcpy(&buf[10], &addr->s6_addr[6], 10); 118 } 119 #endif 120 121 /* 122 * This is for clients that have an infiniband_header in the mbuf. 123 */ 124 void 125 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) 126 { 127 struct infiniband_header *ibh; 128 struct ether_header eh; 129 130 if (mb->m_len < sizeof(*ibh)) 131 return; 132 133 ibh = mtod(mb, struct infiniband_header *); 134 eh.ether_type = ibh->ib_protocol; 135 memset(eh.ether_shost, 0, ETHER_ADDR_LEN); 136 memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); 137 mb->m_data += sizeof(*ibh); 138 mb->m_len -= sizeof(*ibh); 139 mb->m_pkthdr.len -= sizeof(*ibh); 140 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); 141 mb->m_data -= sizeof(*ibh); 142 mb->m_len += sizeof(*ibh); 143 mb->m_pkthdr.len += sizeof(*ibh); 144 } 145 146 static void 147 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) 148 { 149 int csum_flags = 0; 150 151 if (src->m_pkthdr.csum_flags & CSUM_IP) 152 csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); 153 if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 154 csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); 155 if (src->m_pkthdr.csum_flags & CSUM_SCTP) 156 csum_flags |= CSUM_SCTP_VALID; 157 dst->m_pkthdr.csum_flags |= csum_flags; 158 if (csum_flags & CSUM_DATA_VALID) 159 dst->m_pkthdr.csum_data = 0xffff; 160 } 161 162 /* 163 * Handle link-layer encapsulation requests. 
164 */ 165 static int 166 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) 167 { 168 struct infiniband_header *ih; 169 struct arphdr *ah; 170 uint16_t etype; 171 const uint8_t *lladdr; 172 173 if (req->rtype != IFENCAP_LL) 174 return (EOPNOTSUPP); 175 176 if (req->bufsize < INFINIBAND_HDR_LEN) 177 return (ENOMEM); 178 179 ih = (struct infiniband_header *)req->buf; 180 lladdr = req->lladdr; 181 req->lladdr_off = 0; 182 183 switch (req->family) { 184 case AF_INET: 185 etype = htons(ETHERTYPE_IP); 186 break; 187 case AF_INET6: 188 etype = htons(ETHERTYPE_IPV6); 189 break; 190 case AF_ARP: 191 ah = (struct arphdr *)req->hdata; 192 ah->ar_hrd = htons(ARPHRD_INFINIBAND); 193 194 switch (ntohs(ah->ar_op)) { 195 case ARPOP_REVREQUEST: 196 case ARPOP_REVREPLY: 197 etype = htons(ETHERTYPE_REVARP); 198 break; 199 case ARPOP_REQUEST: 200 case ARPOP_REPLY: 201 default: 202 etype = htons(ETHERTYPE_ARP); 203 break; 204 } 205 206 if (req->flags & IFENCAP_FLAG_BROADCAST) 207 lladdr = ifp->if_broadcastaddr; 208 break; 209 default: 210 return (EAFNOSUPPORT); 211 } 212 213 ih->ib_protocol = etype; 214 ih->ib_reserved = 0; 215 memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); 216 req->bufsize = sizeof(struct infiniband_header); 217 218 return (0); 219 } 220 221 static int 222 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, 223 const struct sockaddr *dst, struct route *ro, uint8_t *phdr, 224 uint32_t *pflags, struct llentry **plle) 225 { 226 #if defined(INET) || defined(INET6) 227 struct infiniband_header *ih = (struct infiniband_header *)phdr; 228 #endif 229 uint32_t lleflags = 0; 230 int error = 0; 231 232 if (plle) 233 *plle = NULL; 234 235 switch (dst->sa_family) { 236 #ifdef INET 237 case AF_INET: 238 if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { 239 error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); 240 } else { 241 if (m->m_flags & M_BCAST) { 242 memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, 243 INFINIBAND_ADDR_LEN); 244 } else { 245 
infiniband_ipv4_multicast_map( 246 ((const struct sockaddr_in *)dst)->sin_addr.s_addr, 247 ifp->if_broadcastaddr, ih->ib_hwaddr); 248 } 249 ih->ib_protocol = htons(ETHERTYPE_IP); 250 ih->ib_reserved = 0; 251 } 252 break; 253 #endif 254 #ifdef INET6 255 case AF_INET6: 256 if ((m->m_flags & M_MCAST) == 0) { 257 int af = RO_GET_FAMILY(ro, dst); 258 error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr, 259 &lleflags, plle); 260 } else { 261 infiniband_ipv6_multicast_map( 262 &((const struct sockaddr_in6 *)dst)->sin6_addr, 263 ifp->if_broadcastaddr, ih->ib_hwaddr); 264 ih->ib_protocol = htons(ETHERTYPE_IPV6); 265 ih->ib_reserved = 0; 266 } 267 break; 268 #endif 269 default: 270 if_printf(ifp, "can't handle af%d\n", dst->sa_family); 271 if (m != NULL) 272 m_freem(m); 273 return (EAFNOSUPPORT); 274 } 275 276 if (error == EHOSTDOWN) { 277 if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) 278 error = EHOSTUNREACH; 279 } 280 281 if (error != 0) 282 return (error); 283 284 *pflags = RT_MAY_LOOP; 285 if (lleflags & LLE_IFADDR) 286 *pflags |= RT_L2_ME; 287 288 return (0); 289 } 290 291 /* 292 * Infiniband output routine. 
293 */ 294 static int 295 infiniband_output(struct ifnet *ifp, struct mbuf *m, 296 const struct sockaddr *dst, struct route *ro) 297 { 298 uint8_t linkhdr[INFINIBAND_HDR_LEN]; 299 uint8_t *phdr; 300 struct llentry *lle = NULL; 301 struct infiniband_header *ih; 302 int error = 0; 303 int hlen; /* link layer header length */ 304 uint32_t pflags; 305 bool addref; 306 307 NET_EPOCH_ASSERT(); 308 309 addref = false; 310 phdr = NULL; 311 pflags = 0; 312 if (ro != NULL) { 313 /* XXX BPF uses ro_prepend */ 314 if (ro->ro_prepend != NULL) { 315 phdr = ro->ro_prepend; 316 hlen = ro->ro_plen; 317 } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { 318 if ((ro->ro_flags & RT_LLE_CACHE) != 0) { 319 lle = ro->ro_lle; 320 if (lle != NULL && 321 (lle->la_flags & LLE_VALID) == 0) { 322 LLE_FREE(lle); 323 lle = NULL; /* redundant */ 324 ro->ro_lle = NULL; 325 } 326 if (lle == NULL) { 327 /* if we lookup, keep cache */ 328 addref = 1; 329 } else 330 /* 331 * Notify LLE code that 332 * the entry was used 333 * by datapath. 334 */ 335 llentry_provide_feedback(lle); 336 } 337 if (lle != NULL) { 338 phdr = lle->r_linkdata; 339 hlen = lle->r_hdrlen; 340 pflags = lle->r_flags; 341 } 342 } 343 } 344 345 #ifdef MAC 346 error = mac_ifnet_check_transmit(ifp, m); 347 if (error) 348 goto bad; 349 #endif 350 351 M_PROFILE(m); 352 if (ifp->if_flags & IFF_MONITOR) { 353 error = ENETDOWN; 354 goto bad; 355 } 356 if (!((ifp->if_flags & IFF_UP) && 357 (ifp->if_drv_flags & IFF_DRV_RUNNING))) { 358 error = ENETDOWN; 359 goto bad; 360 } 361 362 if (phdr == NULL) { 363 /* No prepend data supplied. Try to calculate ourselves. */ 364 phdr = linkhdr; 365 hlen = INFINIBAND_HDR_LEN; 366 error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, 367 addref ? &lle : NULL); 368 if (addref && lle != NULL) 369 ro->ro_lle = lle; 370 if (error != 0) 371 return (error == EWOULDBLOCK ? 
0 : error); 372 } 373 374 if ((pflags & RT_L2_ME) != 0) { 375 update_mbuf_csumflags(m, m); 376 return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0)); 377 } 378 379 /* 380 * Add local infiniband header. If no space in first mbuf, 381 * allocate another. 382 */ 383 M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); 384 if (m == NULL) { 385 error = ENOBUFS; 386 goto bad; 387 } 388 if ((pflags & RT_HAS_HEADER) == 0) { 389 ih = mtod(m, struct infiniband_header *); 390 memcpy(ih, phdr, hlen); 391 } 392 393 /* 394 * Queue message on interface, update output statistics if 395 * successful, and start output if interface not yet active. 396 */ 397 return (ifp->if_transmit(ifp, m)); 398 bad: 399 if (m != NULL) 400 m_freem(m); 401 return (error); 402 } 403 404 /* 405 * Process a received Infiniband packet. 406 */ 407 static void 408 infiniband_input(struct ifnet *ifp, struct mbuf *m) 409 { 410 struct infiniband_header *ibh; 411 struct epoch_tracker et; 412 int isr; 413 414 CURVNET_SET_QUIET(ifp->if_vnet); 415 416 if ((ifp->if_flags & IFF_UP) == 0) { 417 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 418 m_freem(m); 419 goto done; 420 } 421 422 ibh = mtod(m, struct infiniband_header *); 423 424 /* 425 * Reset layer specific mbuf flags to avoid confusing upper 426 * layers: 427 */ 428 m->m_flags &= ~M_VLANTAG; 429 m_clrprotoflags(m); 430 431 if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { 432 if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, 433 ifp->if_addrlen) == 0) 434 m->m_flags |= M_BCAST; 435 else 436 m->m_flags |= M_MCAST; 437 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); 438 } 439 440 /* Let BPF have it before we strip the header. */ 441 INFINIBAND_BPF_MTAP(ifp, m); 442 443 /* Allow monitor mode to claim this frame, after stats are updated. */ 444 if (ifp->if_flags & IFF_MONITOR) { 445 m_freem(m); 446 goto done; 447 } 448 449 /* Direct packet to correct FIB based on interface config. 
*/ 450 M_SETFIB(m, ifp->if_fib); 451 452 /* Handle input from a lagg<N> port */ 453 if (ifp->if_type == IFT_INFINIBANDLAG) { 454 KASSERT(lagg_input_infiniband_p != NULL, 455 ("%s: if_lagg not loaded!", __func__)); 456 m = (*lagg_input_infiniband_p)(ifp, m); 457 if (__predict_false(m == NULL)) 458 goto done; 459 ifp = m->m_pkthdr.rcvif; 460 } 461 462 /* 463 * Dispatch frame to upper layer. 464 */ 465 switch (ibh->ib_protocol) { 466 #ifdef INET 467 case htons(ETHERTYPE_IP): 468 isr = NETISR_IP; 469 break; 470 471 case htons(ETHERTYPE_ARP): 472 if (ifp->if_flags & IFF_NOARP) { 473 /* Discard packet if ARP is disabled on interface */ 474 m_freem(m); 475 goto done; 476 } 477 isr = NETISR_ARP; 478 break; 479 #endif 480 #ifdef INET6 481 case htons(ETHERTYPE_IPV6): 482 isr = NETISR_IPV6; 483 break; 484 #endif 485 default: 486 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 487 m_freem(m); 488 goto done; 489 } 490 491 /* Strip off the Infiniband header. */ 492 m_adj(m, INFINIBAND_HDR_LEN); 493 494 #ifdef MAC 495 /* 496 * Tag the mbuf with an appropriate MAC label before any other 497 * consumers can get to it. 498 */ 499 mac_ifnet_create_mbuf(ifp, m); 500 #endif 501 /* Allow monitor mode to claim this frame, after stats are updated. */ 502 NET_EPOCH_ENTER(et); 503 netisr_dispatch(isr, m); 504 NET_EPOCH_EXIT(et); 505 done: 506 CURVNET_RESTORE(); 507 } 508 509 static int 510 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, 511 struct sockaddr *sa) 512 { 513 struct sockaddr_dl *sdl; 514 #ifdef INET 515 struct sockaddr_in *sin; 516 #endif 517 #ifdef INET6 518 struct sockaddr_in6 *sin6; 519 #endif 520 uint8_t *e_addr; 521 522 switch (sa->sa_family) { 523 case AF_LINK: 524 /* 525 * No mapping needed. Just check that it's a valid MC address. 
526 */ 527 sdl = (struct sockaddr_dl *)sa; 528 e_addr = LLADDR(sdl); 529 if (!INFINIBAND_IS_MULTICAST(e_addr)) 530 return (EADDRNOTAVAIL); 531 *llsa = NULL; 532 return 0; 533 534 #ifdef INET 535 case AF_INET: 536 sin = (struct sockaddr_in *)sa; 537 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 538 return (EADDRNOTAVAIL); 539 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 540 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 541 e_addr = LLADDR(sdl); 542 infiniband_ipv4_multicast_map( 543 sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); 544 *llsa = (struct sockaddr *)sdl; 545 return (0); 546 #endif 547 #ifdef INET6 548 case AF_INET6: 549 sin6 = (struct sockaddr_in6 *)sa; 550 /* 551 * An IP6 address of 0 means listen to all of the 552 * multicast address used for IP6. This has no meaning 553 * in infiniband. 554 */ 555 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 556 return (EADDRNOTAVAIL); 557 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) 558 return (EADDRNOTAVAIL); 559 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 560 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 561 e_addr = LLADDR(sdl); 562 infiniband_ipv6_multicast_map( 563 &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); 564 *llsa = (struct sockaddr *)sdl; 565 return (0); 566 #endif 567 default: 568 return (EAFNOSUPPORT); 569 } 570 } 571 572 void 573 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) 574 { 575 struct sockaddr_dl *sdl; 576 struct ifaddr *ifa; 577 int i; 578 579 ifp->if_addrlen = INFINIBAND_ADDR_LEN; 580 ifp->if_hdrlen = INFINIBAND_HDR_LEN; 581 ifp->if_mtu = INFINIBAND_MTU; 582 if_attach(ifp); 583 ifp->if_output = infiniband_output; 584 ifp->if_input = infiniband_input; 585 ifp->if_resolvemulti = infiniband_resolvemulti; 586 ifp->if_requestencap = infiniband_requestencap; 587 588 if (ifp->if_baudrate == 0) 589 ifp->if_baudrate = IF_Gbps(10); /* default value */ 590 if (llb != NULL) 591 ifp->if_broadcastaddr = llb; 592 593 ifa = ifp->if_addr; 594 KASSERT(ifa != NULL, ("%s: no 
lladdr!\n", __func__)); 595 sdl = (struct sockaddr_dl *)ifa->ifa_addr; 596 sdl->sdl_type = IFT_INFINIBAND; 597 sdl->sdl_alen = ifp->if_addrlen; 598 599 if (lla != NULL) { 600 memcpy(LLADDR(sdl), lla, ifp->if_addrlen); 601 602 if (ifp->if_hw_addr != NULL) 603 memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); 604 } else { 605 lla = LLADDR(sdl); 606 } 607 608 /* Attach ethernet compatible network device */ 609 bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); 610 611 /* Announce Infiniband MAC address if non-zero. */ 612 for (i = 0; i < ifp->if_addrlen; i++) 613 if (lla[i] != 0) 614 break; 615 if (i != ifp->if_addrlen) 616 if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); 617 618 /* Add necessary bits are setup; announce it now. */ 619 EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); 620 621 if (IS_DEFAULT_VNET(curvnet)) 622 devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); 623 } 624 625 /* 626 * Perform common duties while detaching an Infiniband interface 627 */ 628 void 629 infiniband_ifdetach(struct ifnet *ifp) 630 { 631 bpfdetach(ifp); 632 if_detach(ifp); 633 } 634 635 static int 636 infiniband_modevent(module_t mod, int type, void *data) 637 { 638 switch (type) { 639 case MOD_LOAD: 640 case MOD_UNLOAD: 641 return (0); 642 default: 643 return (EOPNOTSUPP); 644 } 645 } 646 647 static moduledata_t infiniband_mod = { 648 .name = "if_infiniband", 649 .evhand = &infiniband_modevent, 650 }; 651 652 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); 653 MODULE_VERSION(if_infiniband, 1); 654