/*- * Copyright (c) 2016 Alexander Motin * Copyright (C) 2013 Intel Corporation * Copyright (C) 2015 EMC Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The Non-Transparent Bridge (NTB) is a device that allows you to connect * two or more systems using a PCI-e links, providing remote memory access. * * This module contains a driver for simulated Ethernet device, using * underlying NTB Transport device. * * NOTE: Much of the code in this module is shared with Linux. Any patches may * be picked up and redistributed in Linux with a dual GPL/BSD license. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../ntb_transport.h" #define KTR_NTB KTR_SPARE3 #define NTB_MEDIATYPE (IFM_ETHER | IFM_AUTO | IFM_FDX) #define NTB_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define NTB_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) #define NTB_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ CSUM_PSEUDO_HDR | \ CSUM_IP_CHECKED | CSUM_IP_VALID | \ CSUM_SCTP_VALID) static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "if_ntb"); static unsigned g_if_ntb_num_queues = UINT_MAX; SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN, &g_if_ntb_num_queues, 0, "Number of queues per interface"); struct ntb_net_queue { struct ntb_net_ctx *sc; if_t ifp; struct ntb_transport_qp *qp; struct buf_ring *br; struct task tx_task; struct taskqueue *tx_tq; struct mtx tx_lock; struct callout queue_full; }; struct ntb_net_ctx { if_t ifp; struct ifmedia media; u_char eaddr[ETHER_ADDR_LEN]; int num_queues; struct ntb_net_queue *queues; int mtu; }; static int ntb_net_probe(device_t dev); static int ntb_net_attach(device_t dev); static int ntb_net_detach(device_t dev); static void ntb_net_init(void *arg); static int ntb_ifmedia_upd(struct ifnet *); static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *); static int ntb_ioctl(if_t ifp, u_long command, caddr_t data); static int ntb_transmit(if_t ifp, struct mbuf *m); static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_event_handler(void *data, enum ntb_link_event status); static void ntb_handle_tx(void *arg, int pending); static void ntb_qp_full(void *arg); static void ntb_qflush(if_t ifp); static void create_random_local_eui48(u_char *eaddr); static int ntb_net_probe(device_t dev) { device_set_desc(dev, "NTB Network Interface"); return (0); } static int ntb_net_attach(device_t dev) { struct ntb_net_ctx *sc = device_get_softc(dev); struct ntb_net_queue *q; if_t ifp; struct ntb_queue_handlers handlers = { ntb_net_rx_handler, ntb_net_tx_handler, ntb_net_event_handler }; int i; ifp = sc->ifp = if_gethandle(IFT_ETHER); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setdev(ifp, dev); sc->num_queues = min(g_if_ntb_num_queues, ntb_transport_queue_count(dev)); sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue), M_DEVBUF, M_WAITOK | M_ZERO); sc->mtu = INT_MAX; for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; q->sc = sc; q->ifp = ifp; q->qp = ntb_transport_create_queue(dev, i, &handlers, q); if (q->qp == NULL) break; sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp)); mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF); q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock); TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q); q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT, taskqueue_thread_enqueue, &q->tx_tq); taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d", device_get_nameunit(dev), i); callout_init(&q->queue_full, 1); } sc->num_queues = i; device_printf(dev, "%d queue(s)\n", sc->num_queues); if_setinitfn(ifp, ntb_net_init); if_setsoftc(ifp, sc); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, ntb_ioctl); if_settransmitfn(ifp, ntb_transmit); if_setqflushfn(ifp, ntb_qflush); create_random_local_eui48(sc->eaddr); ether_ifattach(ifp, sc->eaddr); if_setcapabilities(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_JUMBO_MTU | IFCAP_LINKSTATE); if_setcapenable(ifp, IFCAP_JUMBO_MTU | IFCAP_LINKSTATE); if_setmtu(ifp, sc->mtu - ETHER_HDR_LEN); ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd, ntb_ifmedia_sts); ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL); ifmedia_set(&sc->media, NTB_MEDIATYPE); for (i = 0; i < sc->num_queues; i++) ntb_transport_link_up(sc->queues[i].qp); return (0); } static int ntb_net_detach(device_t dev) { struct ntb_net_ctx *sc = device_get_softc(dev); struct ntb_net_queue *q; int i; for (i = 0; i < sc->num_queues; i++) ntb_transport_link_down(sc->queues[i].qp); ether_ifdetach(sc->ifp); if_free(sc->ifp); ifmedia_removeall(&sc->media); for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; ntb_transport_free_queue(q->qp); buf_ring_free(q->br, M_DEVBUF); callout_drain(&q->queue_full); taskqueue_drain_all(q->tx_tq); mtx_destroy(&q->tx_lock); } free(sc->queues, M_DEVBUF); return (0); } /* Network device interface */ static void ntb_net_init(void *arg) { struct ntb_net_ctx *sc = arg; if_t ifp = sc->ifp; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if_setbaudrate(ifp, ntb_transport_link_speed(sc->queues[0].qp)); if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ? LINK_STATE_UP : LINK_STATE_DOWN); } static int ntb_ioctl(if_t ifp, u_long command, caddr_t data) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (command) { case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMTU: { if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) { error = EINVAL; break; } if_setmtu(ifp, ifr->ifr_mtu); break; } case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->media, command); break; case SIOCSIFCAP: if (ifr->ifr_reqcap & IFCAP_RXCSUM) if_setcapenablebit(ifp, IFCAP_RXCSUM, 0); else if_setcapenablebit(ifp, 0, IFCAP_RXCSUM); if (ifr->ifr_reqcap & IFCAP_TXCSUM) { if_setcapenablebit(ifp, IFCAP_TXCSUM, 0); if_sethwassistbits(ifp, NTB_CSUM_FEATURES, 0); } else { if_setcapenablebit(ifp, 0, IFCAP_TXCSUM); if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES); } if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6) if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0); else if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6); if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) { if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0); if_sethwassistbits(ifp, NTB_CSUM_FEATURES6, 0); } else { if_setcapenablebit(ifp, 0, IFCAP_TXCSUM_IPV6); if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES6); } break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int ntb_ifmedia_upd(struct ifnet *ifp) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ifmedia *ifm = &sc->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); return (0); } static void ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct ntb_net_ctx *sc = if_getsoftc(ifp); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = NTB_MEDIATYPE; if (ntb_transport_link_query(sc->queues[0].qp)) ifmr->ifm_status |= IFM_ACTIVE; } static void ntb_transmit_locked(struct ntb_net_queue *q) { if_t ifp = q->ifp; struct mbuf *m; int rc, len; short mflags; CTR0(KTR_NTB, "TX: ntb_transmit_locked"); while ((m = drbr_peek(ifp, q->br)) != NULL) { CTR1(KTR_NTB, "TX: start mbuf %p", m); ether_bpf_mtap_if(ifp, m); len = m->m_pkthdr.len; mflags = m->m_flags; rc = ntb_transport_tx_enqueue(q->qp, m, m, len); if (rc != 0) { CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc); if (rc == EAGAIN) { drbr_putback(ifp, q->br, m); callout_reset_sbt(&q->queue_full, SBT_1MS / 4, SBT_1MS / 4, ntb_qp_full, q, 0); } else { m_freem(m); drbr_advance(ifp, q->br); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } break; } drbr_advance(ifp, q->br); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } } static int ntb_transmit(if_t ifp, struct mbuf *m) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ntb_net_queue *q; int error, i; CTR0(KTR_NTB, "TX: ntb_transmit"); if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % sc->num_queues; else i = curcpu % sc->num_queues; q = &sc->queues[i]; error = drbr_enqueue(ifp, q->br, m); if (error) return (error); if (mtx_trylock(&q->tx_lock)) { ntb_transmit_locked(q); mtx_unlock(&q->tx_lock); } else taskqueue_enqueue(q->tx_tq, &q->tx_task); return (0); } static void ntb_handle_tx(void *arg, int pending) { struct ntb_net_queue *q = arg; mtx_lock(&q->tx_lock); ntb_transmit_locked(q); mtx_unlock(&q->tx_lock); } static void ntb_qp_full(void *arg) { struct ntb_net_queue *q = arg; CTR0(KTR_NTB, "TX: qp_full callout"); if (ntb_transport_tx_free_entry(q->qp) > 0) taskqueue_enqueue(q->tx_tq, &q->tx_task); else callout_schedule_sbt(&q->queue_full, SBT_1MS / 4, SBT_1MS / 4, 0); } static void ntb_qflush(if_t ifp) { struct ntb_net_ctx *sc = if_getsoftc(ifp); struct ntb_net_queue *q; struct mbuf *m; int i; for (i = 0; i < sc->num_queues; i++) { q = &sc->queues[i]; mtx_lock(&q->tx_lock); while ((m = buf_ring_dequeue_sc(q->br)) != NULL) m_freem(m); mtx_unlock(&q->tx_lock); } if_qflush(ifp); } /* Network Device Callbacks */ static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len) { m_freem(data); CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data); } static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len) { struct ntb_net_queue *q = qp_data; struct ntb_net_ctx *sc = q->sc; struct mbuf *m = data; if_t ifp = q->ifp; uint16_t proto; CTR1(KTR_NTB, "RX: rx handler (%d)", len); if (len < 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } m->m_pkthdr.rcvif = ifp; if (sc->num_queues > 1) { m->m_pkthdr.flowid = q - sc->queues; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { m_copydata(m, 12, 2, (void *)&proto); switch (ntohs(proto)) { case ETHERTYPE_IP: if (if_getcapenable(ifp) & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = NTB_CSUM_SET; } break; case ETHERTYPE_IPV6: if (if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = NTB_CSUM_SET; } break; } } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_input(ifp, m); } static void ntb_net_event_handler(void *data, enum ntb_link_event status) { struct ntb_net_queue *q = data; if_setbaudrate(q->ifp, ntb_transport_link_speed(q->qp)); if_link_state_change(q->ifp, (status == NTB_LINK_UP) ? LINK_STATE_UP : LINK_STATE_DOWN); } /* Helper functions */ /* TODO: This too should really be part of the kernel */ #define EUI48_MULTICAST 1 << 0 #define EUI48_LOCALLY_ADMINISTERED 1 << 1 static void create_random_local_eui48(u_char *eaddr) { static uint8_t counter = 0; eaddr[0] = EUI48_LOCALLY_ADMINISTERED; arc4rand(&eaddr[1], 4, 0); eaddr[5] = counter++; } static device_method_t ntb_net_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ntb_net_probe), DEVMETHOD(device_attach, ntb_net_attach), DEVMETHOD(device_detach, ntb_net_detach), DEVMETHOD_END }; static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods, sizeof(struct ntb_net_ctx)); DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, NULL, NULL); MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1); MODULE_VERSION(if_ntb, 1);