/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> #include <sys/errno.h> #include <sys/param.h> #include <sys/callb.h> #include <sys/stream.h> #include <sys/kmem.h> #include <sys/conf.h> #include <sys/devops.h> #include <sys/ksynch.h> #include <sys/stat.h> #include <sys/modctl.h> #include <sys/modhash.h> #include <sys/debug.h> #include <sys/ethernet.h> #include <sys/dlpi.h> #include <net/if.h> #include <sys/mac_provider.h> #include <sys/mac_client.h> #include <sys/mac_client_priv.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/strsun.h> #include <sys/note.h> #include <sys/atomic.h> #include <sys/vnet.h> #include <sys/vlan.h> #include <sys/vnet_mailbox.h> #include <sys/vnet_common.h> #include <sys/dds.h> #include <sys/strsubr.h> #include <sys/taskq.h> /* * Function prototypes. 
*/ /* DDI entrypoints */ static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int vnetattach(dev_info_t *, ddi_attach_cmd_t); static int vnetdetach(dev_info_t *, ddi_detach_cmd_t); /* MAC entrypoints */ static int vnet_m_stat(void *, uint_t, uint64_t *); static int vnet_m_start(void *); static void vnet_m_stop(void *); static int vnet_m_promisc(void *, boolean_t); static int vnet_m_multicst(void *, boolean_t, const uint8_t *); static int vnet_m_unicst(void *, const uint8_t *); mblk_t *vnet_m_tx(void *, mblk_t *); static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp); #ifdef VNET_IOC_DEBUG static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp); #endif static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data); static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index, const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle); static void vnet_get_group(void *arg, mac_ring_type_t type, const int index, mac_group_info_t *infop, mac_group_handle_t handle); static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num); static void vnet_rx_ring_stop(mac_ring_driver_t rdriver); static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val); static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num); static void vnet_tx_ring_stop(mac_ring_driver_t rdriver); static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val); static int vnet_ring_enable_intr(void *arg); static int vnet_ring_disable_intr(void *arg); static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup); static int vnet_addmac(void *arg, const uint8_t *mac_addr); static int vnet_remmac(void *arg, const uint8_t *mac_addr); /* vnet internal functions */ static int vnet_unattach(vnet_t *vnetp); static void vnet_ring_grp_init(vnet_t *vnetp); static void vnet_ring_grp_uninit(vnet_t *vnetp); static int vnet_mac_register(vnet_t *); static int 
vnet_read_mac_address(vnet_t *vnetp); static int vnet_bind_vgenring(vnet_res_t *vresp); static void vnet_unbind_vgenring(vnet_res_t *vresp); static int vnet_bind_hwrings(vnet_t *vnetp); static void vnet_unbind_hwrings(vnet_t *vnetp); static int vnet_bind_rings(vnet_res_t *vresp); static void vnet_unbind_rings(vnet_res_t *vresp); static int vnet_hio_stat(void *, uint_t, uint64_t *); static int vnet_hio_start(void *); static void vnet_hio_stop(void *); mblk_t *vnet_hio_tx(void *, mblk_t *); /* Forwarding database (FDB) routines */ static void vnet_fdb_create(vnet_t *vnetp); static void vnet_fdb_destroy(vnet_t *vnetp); static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp); static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val); void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp); static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp); static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp); static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp); static void vnet_tx_update(vio_net_handle_t vrh); static void vnet_res_start_task(void *arg); static void vnet_start_resources(vnet_t *vnetp); static void vnet_stop_resources(vnet_t *vnetp); static void vnet_dispatch_res_task(vnet_t *vnetp); static void vnet_res_start_task(void *arg); static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err); static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp); static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp); static void vnet_tx_notify_thread(void *); /* Exported to vnet_gen */ int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu); void vnet_link_update(vnet_t *vnetp, link_state_t link_state); void vnet_dds_cleanup_hio(vnet_t *vnetp); static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp); static int vnet_hio_update_kstats(kstat_t *ksp, int rw); static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp); static void vnet_hio_destroy_kstats(kstat_t 
*ksp); /* Exported to to vnet_dds */ int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg); int vnet_hio_mac_init(vnet_t *vnetp, char *ifname); void vnet_hio_mac_cleanup(vnet_t *vnetp); /* Externs that are imported from vnet_gen */ extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip, const uint8_t *macaddr, void **vgenhdl); extern int vgen_init_mdeg(void *arg); extern void vgen_uninit(void *arg); extern int vgen_dds_tx(void *arg, void *dmsg); extern int vgen_enable_intr(void *arg); extern int vgen_disable_intr(void *arg); extern mblk_t *vgen_rx_poll(void *arg, int bytes_to_pickup); /* Externs that are imported from vnet_dds */ extern void vdds_mod_init(void); extern void vdds_mod_fini(void); extern int vdds_init(vnet_t *vnetp); extern void vdds_cleanup(vnet_t *vnetp); extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg); extern void vdds_cleanup_hybrid_res(void *arg); extern void vdds_cleanup_hio(vnet_t *vnetp); extern pri_t minclsyspri; #define DRV_NAME "vnet" #define VNET_FDBE_REFHOLD(p) \ { \ atomic_inc_32(&(p)->refcnt); \ ASSERT((p)->refcnt != 0); \ } #define VNET_FDBE_REFRELE(p) \ { \ ASSERT((p)->refcnt != 0); \ atomic_dec_32(&(p)->refcnt); \ } #ifdef VNET_IOC_DEBUG #define VNET_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) #else #define VNET_M_CALLBACK_FLAGS (MC_GETCAPAB) #endif static mac_callbacks_t vnet_m_callbacks = { VNET_M_CALLBACK_FLAGS, vnet_m_stat, vnet_m_start, vnet_m_stop, vnet_m_promisc, vnet_m_multicst, NULL, /* m_unicst entry must be NULL while rx rings are exposed */ NULL, /* m_tx entry must be NULL while tx rings are exposed */ NULL, vnet_m_ioctl, vnet_m_capab, NULL }; static mac_callbacks_t vnet_hio_res_callbacks = { 0, vnet_hio_stat, vnet_hio_start, vnet_hio_stop, NULL, NULL, NULL, vnet_hio_tx, NULL, NULL, NULL }; /* * Linked list of "vnet_t" structures - one per instance. 
*/
static vnet_t	*vnet_headp = NULL;
/* Reader/writer lock taken around updates of the vnet_headp list. */
static krwlock_t vnet_rw;

/* Tunables */
uint32_t vnet_num_descriptors = VNET_NUM_DESCRIPTORS;

/*
 * Configure tx serialization in mac layer for the vnet device. This tunable
 * should be enabled to improve performance only if HybridIO is configured for
 * the vnet device.
 */
boolean_t vnet_mac_tx_serialize = B_FALSE;

/* Configure enqueuing at Rx soft rings in mac layer for the vnet device */
boolean_t vnet_mac_rx_queuing = B_TRUE;

/*
 * Set this to non-zero to enable additional internal receive buffer pools
 * based on the MTU of the device for better performance at the cost of more
 * memory consumption. This is turned off by default, to use allocb(9F) for
 * receive buffer allocations of sizes > 2K.
 */
boolean_t vnet_jumbo_rxpools = B_FALSE;

/* # of chains in fdb hash table */
uint32_t vnet_fdb_nchains = VNET_NFDB_HASH;

/* Internal tunables */
uint32_t vnet_ethermtu = 1500;	/* mtu of the device */

/*
 * Default vlan id. This is only used internally when the "default-vlan-id"
 * property is not present in the MD device node. Therefore, this should not be
 * used as a tunable; if this value is changed, the corresponding variable
 * should be updated to the same value in vsw and also other vnets connected to
 * the same vsw.
 */
uint16_t vnet_default_vlan_id = 1;

/* delay in usec to wait for all references on a fdb entry to be dropped */
uint32_t vnet_fdbe_refcnt_delay = 10;

/* The all-ones Ethernet broadcast address. */
static struct ether_addr etherbroadcastaddr = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/* mac_open() retry delay in usec */
uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */

/* max # of mac_open() retries */
uint32_t vnet_mac_open_retries = 100;

/*
 * Property names
 */
static char macaddr_propname[] = "local-mac-address";

/*
 * This is the string displayed by modinfo(1m).
*/ static char vnet_ident[] = "vnet driver"; extern struct mod_ops mod_driverops; static struct cb_ops cb_vnetops = { nulldev, /* cb_open */ nulldev, /* cb_close */ nodev, /* cb_strategy */ nodev, /* cb_print */ nodev, /* cb_dump */ nodev, /* cb_read */ nodev, /* cb_write */ nodev, /* cb_ioctl */ nodev, /* cb_devmap */ nodev, /* cb_mmap */ nodev, /* cb_segmap */ nochpoll, /* cb_chpoll */ ddi_prop_op, /* cb_prop_op */ NULL, /* cb_stream */ (int)(D_MP) /* cb_flag */ }; static struct dev_ops vnetops = { DEVO_REV, /* devo_rev */ 0, /* devo_refcnt */ NULL, /* devo_getinfo */ nulldev, /* devo_identify */ nulldev, /* devo_probe */ vnetattach, /* devo_attach */ vnetdetach, /* devo_detach */ nodev, /* devo_reset */ &cb_vnetops, /* devo_cb_ops */ (struct bus_ops *)NULL, /* devo_bus_ops */ NULL, /* devo_power */ ddi_quiesce_not_supported, /* devo_quiesce */ }; static struct modldrv modldrv = { &mod_driverops, /* Type of module. This one is a driver */ vnet_ident, /* ID string */ &vnetops /* driver specific ops */ }; static struct modlinkage modlinkage = { MODREV_1, (void *)&modldrv, NULL }; #ifdef DEBUG #define DEBUG_PRINTF debug_printf /* * Print debug messages - set to 0xf to enable all msgs */ int vnet_dbglevel = 0x8; static void debug_printf(const char *fname, void *arg, const char *fmt, ...) { char buf[512]; va_list ap; vnet_t *vnetp = (vnet_t *)arg; char *bufp = buf; if (vnetp == NULL) { (void) sprintf(bufp, "%s: ", fname); bufp += strlen(bufp); } else { (void) sprintf(bufp, "vnet%d:%s: ", vnetp->instance, fname); bufp += strlen(bufp); } va_start(ap, fmt); (void) vsprintf(bufp, fmt, ap); va_end(ap); cmn_err(CE_CONT, "%s\n", buf); } #endif /* _init(9E): initialize the loadable module */ int _init(void) { int status; DBG1(NULL, "enter\n"); mac_init_ops(&vnetops, "vnet"); status = mod_install(&modlinkage); if (status != 0) { mac_fini_ops(&vnetops); } vdds_mod_init(); DBG1(NULL, "exit(%d)\n", status); return (status); } /* _fini(9E): prepare the module for unloading. 
*/ int _fini(void) { int status; DBG1(NULL, "enter\n"); status = mod_remove(&modlinkage); if (status != 0) return (status); mac_fini_ops(&vnetops); vdds_mod_fini(); DBG1(NULL, "exit(%d)\n", status); return (status); } /* _info(9E): return information about the loadable module */ int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } /* * attach(9E): attach a device to the system. * called once for each instance of the device on the system. */ static int vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) { vnet_t *vnetp; int status; int instance; uint64_t reg; char qname[TASKQ_NAMELEN]; vnet_attach_progress_t attach_progress; attach_progress = AST_init; switch (cmd) { case DDI_ATTACH: break; case DDI_RESUME: case DDI_PM_RESUME: default: goto vnet_attach_fail; } instance = ddi_get_instance(dip); DBG1(NULL, "instance(%d) enter\n", instance); /* allocate vnet_t and mac_t structures */ vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP); vnetp->dip = dip; vnetp->instance = instance; rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL); rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL); attach_progress |= AST_vnet_alloc; vnet_ring_grp_init(vnetp); attach_progress |= AST_ring_init; status = vdds_init(vnetp); if (status != 0) { goto vnet_attach_fail; } attach_progress |= AST_vdds_init; /* setup links to vnet_t from both devinfo and mac_t */ ddi_set_driver_private(dip, (caddr_t)vnetp); /* read the mac address */ status = vnet_read_mac_address(vnetp); if (status != DDI_SUCCESS) { goto vnet_attach_fail; } attach_progress |= AST_read_macaddr; reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", -1); if (reg == -1) { goto vnet_attach_fail; } vnetp->reg = reg; vnet_fdb_create(vnetp); attach_progress |= AST_fdbh_alloc; (void) snprintf(qname, TASKQ_NAMELEN, "vres_taskq%d", instance); if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1, TASKQ_DEFAULTPRI, 0)) == NULL) { cmn_err(CE_WARN, "!vnet%d: Unable to create task queue", instance); goto 
vnet_attach_fail; } attach_progress |= AST_taskq_create; /* add to the list of vnet devices */ WRITE_ENTER(&vnet_rw); vnetp->nextp = vnet_headp; vnet_headp = vnetp; RW_EXIT(&vnet_rw); attach_progress |= AST_vnet_list; /* * Initialize the generic vnet plugin which provides communication via * sun4v LDC (logical domain channel) based resources. This involves 2 * steps; first, vgen_init() is invoked to read the various properties * of the vnet device from its MD node (including its mtu which is * needed to mac_register()) and obtain a handle to the vgen layer. * After mac_register() is done and we have a mac handle, we then * invoke vgen_init_mdeg() which registers with the the MD event * generator (mdeg) framework to allow LDC resource notifications. * Note: this sequence also allows us to report the correct default # * of pseudo rings (2TX and 3RX) in vnet_m_capab() which gets invoked * in the context of mac_register(); and avoids conflicting with * dynamic pseudo rx rings which get added/removed as a result of mdeg * events in vgen. */ status = vgen_init(vnetp, reg, vnetp->dip, (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl); if (status != DDI_SUCCESS) { DERR(vnetp, "vgen_init() failed\n"); goto vnet_attach_fail; } attach_progress |= AST_vgen_init; status = vnet_mac_register(vnetp); if (status != DDI_SUCCESS) { goto vnet_attach_fail; } vnetp->link_state = LINK_STATE_UNKNOWN; attach_progress |= AST_macreg; status = vgen_init_mdeg(vnetp->vgenhdl); if (status != DDI_SUCCESS) { goto vnet_attach_fail; } attach_progress |= AST_init_mdeg; vnetp->attach_progress = attach_progress; DBG1(NULL, "instance(%d) exit\n", instance); return (DDI_SUCCESS); vnet_attach_fail: vnetp->attach_progress = attach_progress; status = vnet_unattach(vnetp); ASSERT(status == 0); return (DDI_FAILURE); } /* * detach(9E): detach a device from the system. 
*/ static int vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) { vnet_t *vnetp; int instance; instance = ddi_get_instance(dip); DBG1(NULL, "instance(%d) enter\n", instance); vnetp = ddi_get_driver_private(dip); if (vnetp == NULL) { goto vnet_detach_fail; } switch (cmd) { case DDI_DETACH: break; case DDI_SUSPEND: case DDI_PM_SUSPEND: default: goto vnet_detach_fail; } if (vnet_unattach(vnetp) != 0) { goto vnet_detach_fail; } return (DDI_SUCCESS); vnet_detach_fail: return (DDI_FAILURE); } /* * Common routine to handle vnetattach() failure and vnetdetach(). Note that * the only reason this function could fail is if mac_unregister() fails. * Otherwise, this function must ensure that all resources are freed and return * success. */ static int vnet_unattach(vnet_t *vnetp) { vnet_attach_progress_t attach_progress; attach_progress = vnetp->attach_progress; /* * Disable the mac device in the gldv3 subsystem. This can fail, in * particular if there are still any open references to this mac * device; in which case we just return failure without continuing to * detach further. * If it succeeds, we then invoke vgen_uninit() which should unregister * any pseudo rings registered with the mac layer. Note we keep the * AST_macreg flag on, so we can unregister with the mac layer at * the end of this routine. */ if (attach_progress & AST_macreg) { if (mac_disable(vnetp->mh) != 0) { return (1); } } /* * Now that we have disabled the device, we must finish all other steps * and successfully return from this function; otherwise we will end up * leaving the device in a broken/unusable state. * * First, release any hybrid resources assigned to this vnet device. */ if (attach_progress & AST_vdds_init) { vdds_cleanup(vnetp); attach_progress &= ~AST_vdds_init; } /* * Uninit vgen. This stops further mdeg callbacks to this vnet * device and/or its ports; and detaches any existing ports. 
*/ if (attach_progress & (AST_vgen_init|AST_init_mdeg)) { vgen_uninit(vnetp->vgenhdl); attach_progress &= ~AST_vgen_init; attach_progress &= ~AST_init_mdeg; } /* Destroy the taskq. */ if (attach_progress & AST_taskq_create) { ddi_taskq_destroy(vnetp->taskqp); attach_progress &= ~AST_taskq_create; } /* Destroy fdb. */ if (attach_progress & AST_fdbh_alloc) { vnet_fdb_destroy(vnetp); attach_progress &= ~AST_fdbh_alloc; } /* Remove from the device list */ if (attach_progress & AST_vnet_list) { vnet_t **vnetpp; /* unlink from instance(vnet_t) list */ WRITE_ENTER(&vnet_rw); for (vnetpp = &vnet_headp; *vnetpp; vnetpp = &(*vnetpp)->nextp) { if (*vnetpp == vnetp) { *vnetpp = vnetp->nextp; break; } } RW_EXIT(&vnet_rw); attach_progress &= ~AST_vnet_list; } if (attach_progress & AST_ring_init) { vnet_ring_grp_uninit(vnetp); attach_progress &= ~AST_ring_init; } if (attach_progress & AST_macreg) { VERIFY(mac_unregister(vnetp->mh) == 0); vnetp->mh = NULL; attach_progress &= ~AST_macreg; } if (attach_progress & AST_vnet_alloc) { rw_destroy(&vnetp->vrwlock); rw_destroy(&vnetp->vsw_fp_rw); attach_progress &= ~AST_vnet_list; KMEM_FREE(vnetp); } return (0); } /* enable the device for transmit/receive */ static int vnet_m_start(void *arg) { vnet_t *vnetp = arg; DBG1(vnetp, "enter\n"); WRITE_ENTER(&vnetp->vrwlock); vnetp->flags |= VNET_STARTED; vnet_start_resources(vnetp); RW_EXIT(&vnetp->vrwlock); DBG1(vnetp, "exit\n"); return (VNET_SUCCESS); } /* stop transmit/receive for the device */ static void vnet_m_stop(void *arg) { vnet_t *vnetp = arg; DBG1(vnetp, "enter\n"); WRITE_ENTER(&vnetp->vrwlock); if (vnetp->flags & VNET_STARTED) { /* * Set the flags appropriately; this should prevent starting of * any new resources that are added(see vnet_res_start_task()), * while we release the vrwlock in vnet_stop_resources() before * stopping each resource. 
*/ vnetp->flags &= ~VNET_STARTED; vnetp->flags |= VNET_STOPPING; vnet_stop_resources(vnetp); vnetp->flags &= ~VNET_STOPPING; } RW_EXIT(&vnetp->vrwlock); DBG1(vnetp, "exit\n"); } /* set the unicast mac address of the device */ static int vnet_m_unicst(void *arg, const uint8_t *macaddr) { _NOTE(ARGUNUSED(macaddr)) vnet_t *vnetp = arg; DBG1(vnetp, "enter\n"); /* * NOTE: setting mac address dynamically is not supported. */ DBG1(vnetp, "exit\n"); return (VNET_FAILURE); } /* enable/disable a multicast address */ static int vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca) { _NOTE(ARGUNUSED(add, mca)) vnet_t *vnetp = arg; vnet_res_t *vresp; mac_register_t *macp; mac_callbacks_t *cbp; int rv = VNET_SUCCESS; DBG1(vnetp, "enter\n"); READ_ENTER(&vnetp->vsw_fp_rw); if (vnetp->vsw_fp == NULL) { RW_EXIT(&vnetp->vsw_fp_rw); return (EAGAIN); } VNET_FDBE_REFHOLD(vnetp->vsw_fp); RW_EXIT(&vnetp->vsw_fp_rw); vresp = vnetp->vsw_fp; macp = &vresp->macreg; cbp = macp->m_callbacks; rv = cbp->mc_multicst(macp->m_driver, add, mca); VNET_FDBE_REFRELE(vnetp->vsw_fp); DBG1(vnetp, "exit(%d)\n", rv); return (rv); } /* set or clear promiscuous mode on the device */ static int vnet_m_promisc(void *arg, boolean_t on) { _NOTE(ARGUNUSED(on)) vnet_t *vnetp = arg; DBG1(vnetp, "enter\n"); /* * NOTE: setting promiscuous mode is not supported, just return success. */ DBG1(vnetp, "exit\n"); return (VNET_SUCCESS); } /* * Transmit a chain of packets. This function provides switching functionality * based on the destination mac address to reach other guests (within ldoms) or * external hosts. */ mblk_t * vnet_tx_ring_send(void *arg, mblk_t *mp) { vnet_pseudo_tx_ring_t *tx_ringp; vnet_tx_ring_stats_t *statsp; vnet_t *vnetp; vnet_res_t *vresp; mblk_t *next; mblk_t *resid_mp; mac_register_t *macp; struct ether_header *ehp; boolean_t is_unicast; boolean_t is_pvid; /* non-default pvid ? */ boolean_t hres; /* Hybrid resource ? 
*/ void *tx_arg; size_t size; tx_ringp = (vnet_pseudo_tx_ring_t *)arg; statsp = &tx_ringp->tx_ring_stats; vnetp = (vnet_t *)tx_ringp->vnetp; DBG1(vnetp, "enter\n"); ASSERT(mp != NULL); is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE; while (mp != NULL) { next = mp->b_next; mp->b_next = NULL; /* update stats */ size = msgsize(mp); /* * Find fdb entry for the destination * and hold a reference to it. */ ehp = (struct ether_header *)mp->b_rptr; vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost); if (vresp != NULL) { /* * Destination found in FDB. * The destination is a vnet device within ldoms * and directly reachable, invoke the tx function * in the fdb entry. */ macp = &vresp->macreg; resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp); /* tx done; now release ref on fdb entry */ VNET_FDBE_REFRELE(vresp); if (resid_mp != NULL) { /* m_tx failed */ mp->b_next = next; break; } } else { is_unicast = !(IS_BROADCAST(ehp) || (IS_MULTICAST(ehp))); /* * Destination is not in FDB. * If the destination is broadcast or multicast, * then forward the packet to vswitch. * If a Hybrid resource avilable, then send the * unicast packet via hybrid resource, otherwise * forward it to vswitch. */ READ_ENTER(&vnetp->vsw_fp_rw); if ((is_unicast) && (vnetp->hio_fp != NULL)) { vresp = vnetp->hio_fp; hres = B_TRUE; } else { vresp = vnetp->vsw_fp; hres = B_FALSE; } if (vresp == NULL) { /* * no fdb entry to vsw? drop the packet. */ RW_EXIT(&vnetp->vsw_fp_rw); freemsg(mp); mp = next; continue; } /* ref hold the fdb entry to vsw */ VNET_FDBE_REFHOLD(vresp); RW_EXIT(&vnetp->vsw_fp_rw); /* * In the case of a hybrid resource we need to insert * the tag for the pvid case here; unlike packets that * are destined to a vnet/vsw in which case the vgen * layer does the tagging before sending it over ldc. */ if (hres == B_TRUE) { /* * Determine if the frame being transmitted * over the hybrid resource is untagged. If so, * insert the tag before transmitting. 
*/ if (is_pvid == B_TRUE && ehp->ether_type != htons(ETHERTYPE_VLAN)) { mp = vnet_vlan_insert_tag(mp, vnetp->pvid); if (mp == NULL) { VNET_FDBE_REFRELE(vresp); mp = next; continue; } } macp = &vresp->macreg; tx_arg = tx_ringp; } else { macp = &vresp->macreg; tx_arg = macp->m_driver; } resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp); /* tx done; now release ref on fdb entry */ VNET_FDBE_REFRELE(vresp); if (resid_mp != NULL) { /* m_tx failed */ mp->b_next = next; break; } } statsp->obytes += size; statsp->opackets++; mp = next; } DBG1(vnetp, "exit\n"); return (mp); } /* get statistics from the device */ int vnet_m_stat(void *arg, uint_t stat, uint64_t *val) { vnet_t *vnetp = arg; vnet_res_t *vresp; mac_register_t *macp; mac_callbacks_t *cbp; uint64_t val_total = 0; DBG1(vnetp, "enter\n"); /* * get the specified statistic from each transport and return the * aggregate val. This obviously only works for counters. */ if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) || (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) { return (ENOTSUP); } READ_ENTER(&vnetp->vrwlock); for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) { macp = &vresp->macreg; cbp = macp->m_callbacks; if (cbp->mc_getstat(macp->m_driver, stat, val) == 0) val_total += *val; } RW_EXIT(&vnetp->vrwlock); *val = val_total; DBG1(vnetp, "exit\n"); return (0); } static void vnet_ring_grp_init(vnet_t *vnetp) { vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_rx_ring_t *rx_ringp; vnet_pseudo_tx_group_t *tx_grp; vnet_pseudo_tx_ring_t *tx_ringp; int i; tx_grp = &vnetp->tx_grp[0]; tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) * VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP); for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) { tx_ringp[i].state |= VNET_TXRING_SHARED; } tx_grp->rings = tx_ringp; tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS; mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL); tx_grp->flowctl_thread = thread_create(NULL, 0, 
vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri); rx_grp = &vnetp->rx_grp[0]; rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP; rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL); rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) * rx_grp->max_ring_cnt, KM_SLEEP); /* * Setup the first 3 Pseudo RX Rings that are reserved; * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource. */ rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE; rx_ringp[0].index = 0; rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID; rx_ringp[1].index = 1; rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID; rx_ringp[2].index = 2; rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; rx_grp->rings = rx_ringp; for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; i < rx_grp->max_ring_cnt; i++) { rx_ringp = &rx_grp->rings[i]; rx_ringp->state = VNET_RXRING_FREE; rx_ringp->index = i; } } static void vnet_ring_grp_uninit(vnet_t *vnetp) { vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_tx_group_t *tx_grp; kt_did_t tid = 0; tx_grp = &vnetp->tx_grp[0]; /* Inform tx_notify_thread to exit */ mutex_enter(&tx_grp->flowctl_lock); if (tx_grp->flowctl_thread != NULL) { tid = tx_grp->flowctl_thread->t_did; tx_grp->flowctl_done = B_TRUE; cv_signal(&tx_grp->flowctl_cv); } mutex_exit(&tx_grp->flowctl_lock); if (tid != 0) thread_join(tid); if (tx_grp->rings != NULL) { ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS); kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) * tx_grp->ring_cnt); tx_grp->rings = NULL; } rx_grp = &vnetp->rx_grp[0]; if (rx_grp->rings != NULL) { ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP); ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT); kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) * rx_grp->max_ring_cnt); rx_grp->rings = NULL; } } static vnet_pseudo_rx_ring_t * vnet_alloc_pseudo_rx_ring(vnet_t *vnetp) { vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_rx_ring_t *rx_ringp; int index; rx_grp = &vnetp->rx_grp[0]; 
WRITE_ENTER(&rx_grp->lock); if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) { /* no rings available */ RW_EXIT(&rx_grp->lock); return (NULL); } for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; index < rx_grp->max_ring_cnt; index++) { rx_ringp = &rx_grp->rings[index]; if (rx_ringp->state == VNET_RXRING_FREE) { rx_ringp->state |= VNET_RXRING_INUSE; rx_grp->ring_cnt++; break; } } RW_EXIT(&rx_grp->lock); return (rx_ringp); } static void vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp) { vnet_pseudo_rx_group_t *rx_grp; ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT); rx_grp = &vnetp->rx_grp[0]; WRITE_ENTER(&rx_grp->lock); if (ringp->state != VNET_RXRING_FREE) { ringp->state = VNET_RXRING_FREE; ringp->handle = NULL; rx_grp->ring_cnt--; } RW_EXIT(&rx_grp->lock); } /* wrapper function for mac_register() */ static int vnet_mac_register(vnet_t *vnetp) { mac_register_t *macp; int err; if ((macp = mac_alloc(MAC_VERSION)) == NULL) return (DDI_FAILURE); macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; macp->m_driver = vnetp; macp->m_dip = vnetp->dip; macp->m_src_addr = vnetp->curr_macaddr; macp->m_callbacks = &vnet_m_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = vnetp->mtu; macp->m_margin = VLAN_TAGSZ; macp->m_v12n = MAC_VIRT_LEVEL1; /* * Finally, we're ready to register ourselves with the MAC layer * interface; if this succeeds, we're all ready to start() */ err = mac_register(macp, &vnetp->mh); mac_free(macp); return (err == 0 ? 
DDI_SUCCESS : DDI_FAILURE); } /* read the mac address of the device */ static int vnet_read_mac_address(vnet_t *vnetp) { uchar_t *macaddr; uint32_t size; int rv; rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip, DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size); if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) { DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n", macaddr_propname, rv); return (DDI_FAILURE); } bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL); bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL); ddi_prop_free(macaddr); return (DDI_SUCCESS); } static void vnet_fdb_create(vnet_t *vnetp) { char hashname[MAXNAMELEN]; (void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash", vnetp->instance); vnetp->fdb_nchains = vnet_fdb_nchains; vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname, vnetp->fdb_nchains, mod_hash_null_valdtor, sizeof (void *)); } static void vnet_fdb_destroy(vnet_t *vnetp) { /* destroy fdb-hash-table */ if (vnetp->fdb_hashp != NULL) { mod_hash_destroy_hash(vnetp->fdb_hashp); vnetp->fdb_hashp = NULL; vnetp->fdb_nchains = 0; } } /* * Add an entry into the fdb. */ void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp) { uint64_t addr = 0; int rv; KEY_HASH(addr, vresp->rem_macaddr); /* * If the entry being added corresponds to LDC_SERVICE resource, * that is, vswitch connection, it is added to the hash and also * the entry is cached, an additional reference count reflects * this. The HYBRID resource is not added to the hash, but only * cached, as it is only used for sending out packets for unknown * unicast destinations. */ (vresp->type == VIO_NET_RES_LDC_SERVICE) ? (vresp->refcnt = 1) : (vresp->refcnt = 0); /* * Note: duplicate keys will be rejected by mod_hash. 
*/ if (vresp->type != VIO_NET_RES_HYBRID) { rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr, (mod_hash_val_t)vresp); if (rv != 0) { DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr); return; } } if (vresp->type == VIO_NET_RES_LDC_SERVICE) { /* Cache the fdb entry to vsw-port */ WRITE_ENTER(&vnetp->vsw_fp_rw); if (vnetp->vsw_fp == NULL) vnetp->vsw_fp = vresp; RW_EXIT(&vnetp->vsw_fp_rw); } else if (vresp->type == VIO_NET_RES_HYBRID) { /* Cache the fdb entry to hybrid resource */ WRITE_ENTER(&vnetp->vsw_fp_rw); if (vnetp->hio_fp == NULL) vnetp->hio_fp = vresp; RW_EXIT(&vnetp->vsw_fp_rw); } } /* * Remove an entry from fdb. */ static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp) { uint64_t addr = 0; int rv; uint32_t refcnt; vnet_res_t *tmp; KEY_HASH(addr, vresp->rem_macaddr); /* * Remove the entry from fdb hash table. * This prevents further references to this fdb entry. */ if (vresp->type != VIO_NET_RES_HYBRID) { rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr, (mod_hash_val_t *)&tmp); if (rv != 0) { /* * As the resources are added to the hash only * after they are started, this can occur if * a resource unregisters before it is ever started. */ return; } } if (vresp->type == VIO_NET_RES_LDC_SERVICE) { WRITE_ENTER(&vnetp->vsw_fp_rw); ASSERT(tmp == vnetp->vsw_fp); vnetp->vsw_fp = NULL; RW_EXIT(&vnetp->vsw_fp_rw); } else if (vresp->type == VIO_NET_RES_HYBRID) { WRITE_ENTER(&vnetp->vsw_fp_rw); vnetp->hio_fp = NULL; RW_EXIT(&vnetp->vsw_fp_rw); } /* * If there are threads already ref holding before the entry was * removed from hash table, then wait for ref count to drop to zero. */ (vresp->type == VIO_NET_RES_LDC_SERVICE) ? (refcnt = 1) : (refcnt = 0); while (vresp->refcnt > refcnt) { delay(drv_usectohz(vnet_fdbe_refcnt_delay)); } } /* * Search fdb for a given mac address. If an entry is found, hold * a reference to it and return the entry; else returns NULL. 
*/
static vnet_res_t *
vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
{
	uint64_t	key = 0;
	vnet_res_t	*vresp;
	int		rv;

	KEY_HASH(key, addrp->ether_addr_octet);

	/* vnet_fdbe_find_cb() takes the reference on the entry found */
	rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);

	if (rv != 0)
		return (NULL);

	return (vresp);
}

/*
 * Callback function provided to mod_hash_find_cb(). After finding the fdb
 * entry corresponding to the key (macaddr), this callback will be invoked by
 * mod_hash_find_cb() to atomically increment the reference count on the fdb
 * entry before returning the found entry.
 */
static void
vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
{
	_NOTE(ARGUNUSED(key))
	VNET_FDBE_REFHOLD((vnet_res_t *)val);
}

/*
 * Frames received that are tagged with the pvid of the vnet device must be
 * untagged before sending up the stack. This function walks the chain of rx
 * frames, untags any such frames and hands back the updated chain.
 *
 * Arguments:
 *    pvid:  pvid of the vnet device for which packets are being received
 *    mp:    in:  address of the head of the pkt chain to be validated and
 *                untagged
 *           out: *mp is set to the head of the updated chain (NULL if no
 *                frame is left in it)
 *
 * The function itself returns nothing. A frame for which
 * vnet_vlan_remove_tag() returns NULL is left out of the updated chain.
 */
static void
vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
{
	struct ether_vlan_header	*evhp;
	mblk_t				*bp;
	mblk_t				*bpt;
	mblk_t				*bph;
	mblk_t				*bpn;

	bpn = bph = bpt = NULL;

	for (bp = *mp; bp != NULL; bp = bpn) {

		/* unlink the frame from the chain before examining it */
		bpn = bp->b_next;
		bp->b_next = bp->b_prev = NULL;

		evhp = (struct ether_vlan_header *)bp->b_rptr;

		if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
		    VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {

			bp = vnet_vlan_remove_tag(bp);
			if (bp == NULL) {
				continue;
			}

		}

		/* build a chain of processed packets */
		if (bph == NULL) {
			bph = bpt = bp;
		} else {
			bpt->b_next = bp;
			bpt = bp;
		}

	}

	*mp = bph;
}

/*
 * Receive callback handed to resources (installed as vio_net_rx_cb in
 * vio_net_resource_reg()). Passes the chain to the mac layer on the pseudo
 * rx ring bound to the resource; the chain is freed instead if the resource
 * has no vnet or the vnet has no mac handle.
 */
static void
vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_rx_ring_t	*ringp;

	if ((vnetp == NULL) || (vnetp->mh == 0)) {
		freemsgchain(mp);
		return;
	}

	ringp = vresp->rx_ringp;
	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}

/*
 * Transmit-restart callback handed to resources (installed as
 * vio_net_tx_update in vio_net_resource_reg()). Propagates the notification
 * to every pseudo tx ring of the vnet; see the comment in the body for why
 * all rings are updated.
 */
void
vnet_tx_update(vio_net_handle_t vrh)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	int			i;

	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	/*
	 * Currently, the tx hwring API (used to access rings that belong to
	 * a Hybrid IO resource) does not provide us a per ring flow ctrl
	 * update; also the pseudo rings are shared by the ports/ldcs in the
	 * vgen layer. Thus we can't figure out which pseudo ring is being
	 * re-enabled for transmits. To work around this, when we get a tx
	 * restart notification from below, we simply propagate that to all
	 * the tx pseudo rings registered with the mac layer above.
	 *
	 * There are a couple of side effects with this approach, but they are
	 * not harmful, as outlined below:
	 *
	 * A) We might send an invalid ring_update() for a ring that is not
	 * really flow controlled. This will not have any effect in the mac
	 * layer and packets will continue to be transmitted on that ring.
	 *
	 * B) We might end up clearing the flow control in the mac layer for
	 * a ring that is still flow controlled in the underlying resource.
	 * This will result in the mac layer restarting transmit, only to be
	 * flow controlled again on that ring.
	 */
	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
	}
}

/*
 * vnet_tx_notify_thread:
 *
 * vnet_tx_ring_update() callback function wakes up this thread when
 * it gets called. This thread will call mac_tx_ring_update() to
 * notify upper mac of flow control getting relieved. Note that
 * vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
 * because vnet_tx_ring_update() is called from lower mac with
 * mi_rw_lock held and mac_tx_ring_update() would also try to grab
 * the same lock.
*/
static void
vnet_tx_notify_thread(void *arg)
{
	callb_cpr_t		cprinfo;
	vnet_pseudo_tx_group_t	*tx_grp = (vnet_pseudo_tx_group_t *)arg;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_t			*vnetp;
	int			i;

	CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
	    "vnet_tx_notify_thread");

	mutex_enter(&tx_grp->flowctl_lock);
	while (!tx_grp->flowctl_done) {
		/* sleep until vnet_tx_ring_update() signals flowctl_cv */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);

		/*
		 * Notify the mac layer for every ring marked woken_up;
		 * note that flowctl_lock is held across the calls.
		 */
		for (i = 0; i < tx_grp->ring_cnt; i++) {
			tx_ringp = &tx_grp->rings[i];
			if (tx_ringp->woken_up) {
				tx_ringp->woken_up = B_FALSE;
				vnetp = tx_ringp->vnetp;
				mac_tx_ring_update(vnetp->mh,
				    tx_ringp->handle);
			}
		}
	}
	/*
	 * The tx_grp is being destroyed, exit the thread.
	 */
	tx_grp->flowctl_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * Callback that marks the pseudo tx ring bound to a hw ring as woken up and
 * signals vnet_tx_notify_thread().
 *
 *    arg1:  the vnet instance (vnet_t *)
 *    arg2:  the hw ring handle; it is compared against hw_rh of each pseudo
 *           tx ring and only the first match is marked.
 */
void
vnet_tx_ring_update(void *arg1, uintptr_t arg2)
{
	vnet_t			*vnetp = (vnet_t *)arg1;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
			mutex_enter(&tx_grp->flowctl_lock);
			tx_ringp->woken_up = B_TRUE;
			cv_signal(&tx_grp->flowctl_cv);
			mutex_exit(&tx_grp->flowctl_lock);
			break;
		}
	}
}

/*
 * Update the new mtu of vnet into the mac layer. First check if the device has
 * been plumbed and if so fail the mtu update. Returns 0 on success.
*/
int
vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
{
	int	rv;

	/* EINVAL: no vnet, or the vnet has no mac handle */
	if (vnetp == NULL || vnetp->mh == NULL) {
		return (EINVAL);
	}

	WRITE_ENTER(&vnetp->vrwlock);

	/* EBUSY: the device is started (plumbed) */
	if (vnetp->flags & VNET_STARTED) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
		    "update as the device is plumbed\n",
		    vnetp->instance);
		return (EBUSY);
	}

	/* update mtu in the mac layer; any failure is reported as EIO */
	rv = mac_maxsdu_update(vnetp->mh, mtu);
	if (rv != 0) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE,
		    "!vnet%d: Unable to update mtu with mac layer\n",
		    vnetp->instance);
		return (EIO);
	}

	vnetp->mtu = mtu;

	RW_EXIT(&vnetp->vrwlock);

	return (0);
}

/*
 * Update the link state of vnet to the mac layer.
 */
void
vnet_link_update(vnet_t *vnetp, link_state_t link_state)
{
	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	WRITE_ENTER(&vnetp->vrwlock);
	/* nothing to report if the state did not change */
	if (vnetp->link_state == link_state) {
		RW_EXIT(&vnetp->vrwlock);
		return;
	}
	vnetp->link_state = link_state;
	RW_EXIT(&vnetp->vrwlock);

	/* the mac layer is notified after vrwlock has been dropped */
	mac_link_update(vnetp->mh, link_state);
}

/*
 * vio_net_resource_reg -- An interface called to register a resource
 *	with vnet.
 *	macp -- a GLDv3 mac_register that has all the details of
 *		a resource and its callbacks etc.
 *	type -- resource type.
 *	local_macaddr -- resource's MAC address. This is used to
 *			 associate a resource with a corresponding vnet.
 *	remote_macaddr -- remote side MAC address. This is ignored for
 *			  the Hybrid resources.
 *	vhp -- A handle returned to the caller.
 *	vcb -- A set of callbacks provided to the callers.
 *
 *	Returns 0 on success, ENXIO if no vnet instance matches the
 *	resource, and 1 if the resource cannot be bound to pseudo ring(s).
*/
int
vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
    ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
    vio_net_callbacks_t *vcb)
{
	vnet_t		*vnetp;
	vnet_res_t	*vresp;

	/* The resource keeps its own copy of the caller's mac_register_t. */
	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
	ether_copy(local_macaddr, vresp->local_macaddr);
	ether_copy(rem_macaddr, vresp->rem_macaddr);
	vresp->type = type;
	bcopy(macp, &vresp->macreg, sizeof (mac_register_t));

	DBG1(NULL, "Resource Registerig type=0%X\n", type);

	/* Find the vnet instance this resource belongs to. */
	READ_ENTER(&vnet_rw);
	vnetp = vnet_headp;
	while (vnetp != NULL) {
		if (VNET_MATCH_RES(vresp, vnetp)) {
			vresp->vnetp = vnetp;

			/* Setup kstats for hio resource */
			if (vresp->type == VIO_NET_RES_HYBRID) {
				vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
				    "hio", vresp);
				if (vresp->ksp == NULL) {
					cmn_err(CE_NOTE, "!vnet%d: Cannot "
					    "create kstats for hio resource",
					    vnetp->instance);
				}
			}
			vnet_add_resource(vnetp, vresp);
			break;
		}
		vnetp = vnetp->nextp;
	}
	RW_EXIT(&vnet_rw);
	if (vresp->vnetp == NULL) {
		/* No match: *vhp and *vcb are left untouched. */
		DWARN(NULL, "No vnet instance");
		kmem_free(vresp, sizeof (vnet_res_t));
		return (ENXIO);
	}

	*vhp = vresp;
	vcb->vio_net_rx_cb = vnet_rx;
	vcb->vio_net_tx_update = vnet_tx_update;
	vcb->vio_net_report_err = vnet_handle_res_err;

	/* Bind the resource to pseudo ring(s) */
	if (vnet_bind_rings(vresp) != 0) {
		(void) vnet_rem_resource(vnetp, vresp);
		vnet_hio_destroy_kstats(vresp->ksp);
		KMEM_FREE(vresp);
		/*
		 * The handle published above now refers to freed memory.
		 * Clear it so that a caller which cleans up based on a
		 * non-NULL handle (e.g. vnet_hio_mac_cleanup()) does not
		 * unregister, and thereby free, the resource a second time.
		 */
		*vhp = NULL;
		return (1);
	}

	/* Dispatch a task to start resources */
	vnet_dispatch_res_task(vnetp);
	return (0);
}

/*
 * vio_net_resource_unreg -- An interface to unregister a resource.
 */
void
vio_net_resource_unreg(vio_net_handle_t vhp)
{
	vnet_res_t	*vresp = (vnet_res_t *)vhp;
	vnet_t		*vnetp = vresp->vnetp;

	DBG1(NULL, "Resource Registerig hdl=0x%p", vhp);

	ASSERT(vnetp != NULL);
	/*
	 * Remove the resource from fdb; this ensures
	 * there are no references to the resource.
*/ vnet_fdbe_del(vnetp, vresp); vnet_unbind_rings(vresp); /* Now remove the resource from the list */ (void) vnet_rem_resource(vnetp, vresp); vnet_hio_destroy_kstats(vresp->ksp); KMEM_FREE(vresp); } static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp) { WRITE_ENTER(&vnetp->vrwlock); vresp->nextp = vnetp->vres_list; vnetp->vres_list = vresp; RW_EXIT(&vnetp->vrwlock); } static vnet_res_t * vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp) { vnet_res_t *vrp; WRITE_ENTER(&vnetp->vrwlock); if (vresp == vnetp->vres_list) { vnetp->vres_list = vresp->nextp; } else { vrp = vnetp->vres_list; while (vrp->nextp != NULL) { if (vrp->nextp == vresp) { vrp->nextp = vresp->nextp; break; } vrp = vrp->nextp; } } vresp->vnetp = NULL; vresp->nextp = NULL; RW_EXIT(&vnetp->vrwlock); return (vresp); } /* * vnet_dds_rx -- an interface called by vgen to DDS messages. */ void vnet_dds_rx(void *arg, void *dmsg) { vnet_t *vnetp = arg; vdds_process_dds_msg(vnetp, dmsg); } /* * vnet_send_dds_msg -- An interface provided to DDS to send * DDS messages. This simply sends meessages via vgen. */ int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg) { int rv; if (vnetp->vgenhdl != NULL) { rv = vgen_dds_tx(vnetp->vgenhdl, dmsg); } return (rv); } /* * vnet_cleanup_hio -- an interface called by vgen to cleanup hio resources. */ void vnet_dds_cleanup_hio(vnet_t *vnetp) { vdds_cleanup_hio(vnetp); } /* * vnet_handle_res_err -- A callback function called by a resource * to report an error. For example, vgen can call to report * an LDC down/reset event. This will trigger cleanup of associated * Hybrid resource. */ /* ARGSUSED */ static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err) { vnet_res_t *vresp = (vnet_res_t *)vrh; vnet_t *vnetp = vresp->vnetp; if (vnetp == NULL) { return; } if ((vresp->type != VIO_NET_RES_LDC_SERVICE) && (vresp->type != VIO_NET_RES_HYBRID)) { return; } vdds_cleanup_hio(vnetp); } /* * vnet_dispatch_res_task -- A function to dispatch tasks start resources. 
*/
static void
vnet_dispatch_res_task(vnet_t *vnetp)
{
	int rv;

	/*
	 * Dispatch the task. It could be the case that vnetp->flags does
	 * not have VNET_STARTED set. This is ok as vnet_res_start_task()
	 * can abort the task when the task is started. See related comments
	 * in vnet_m_stop() and vnet_stop_resources().
	 */
	rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
	    vnetp, DDI_NOSLEEP);
	if (rv != DDI_SUCCESS) {
		/* the failure is only logged; nothing is retried here */
		cmn_err(CE_WARN,
		    "vnet%d:Can't dispatch start resource task",
		    vnetp->instance);
	}
}

/*
 * vnet_res_start_task -- A taskq callback function that starts a resource.
 *	Resources are started only if the vnet itself is started; vrwlock is
 *	held as writer across vnet_start_resources().
 */
static void
vnet_res_start_task(void *arg)
{
	vnet_t *vnetp = arg;

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->flags & VNET_STARTED) {
		vnet_start_resources(vnetp);
	}
	RW_EXIT(&vnetp->vrwlock);
}

/*
 * vnet_start_resources -- starts all resources associated with
 *	a vnet. The caller must hold vrwlock as writer. A resource whose
 *	mc_start() fails is left unstarted and is not added to the fdb.
 */
static void
vnet_start_resources(vnet_t *vnetp)
{
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	vnet_res_t	*vresp;
	int rv;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
		/* skip if it is already started */
		if (vresp->flags & VNET_STARTED) {
			continue;
		}
		macp = &vresp->macreg;
		cbp = macp->m_callbacks;
		rv = cbp->mc_start(macp->m_driver);
		if (rv == 0) {
			/*
			 * Successfully started the resource, so now
			 * add it to the fdb.
			 */
			vresp->flags |= VNET_STARTED;
			vnet_fdbe_add(vnetp, vresp);
		}
	}

	DBG1(vnetp, "exit\n");

}

/*
 * vnet_stop_resources -- stop all resources associated with a vnet.
 *	The caller must hold vrwlock as writer; the lock is dropped and
 *	re-acquired around each mc_stop() call.
 */
static void
vnet_stop_resources(vnet_t *vnetp)
{
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; ) {
		if (vresp->flags & VNET_STARTED) {
			/*
			 * Release the lock while invoking mc_stop() of the
			 * underlying resource. We hold a reference to this
			 * resource to prevent being removed from the list in
			 * vio_net_resource_unreg().
Note that new resources
			 * can be added to the head of the list while the lock
			 * is released, but they won't be started, as
			 * VNET_STARTED flag has been cleared for the vnet
			 * device in vnet_m_stop(). Also, while the lock is
			 * released a resource could be removed from the list
			 * in vio_net_resource_unreg(); but that is ok, as we
			 * re-acquire the lock and only then access the forward
			 * link (vresp->nextp) to continue with the next
			 * resource.
			 */
			vresp->flags &= ~VNET_STARTED;
			vresp->flags |= VNET_STOPPING;
			macp = &vresp->macreg;
			cbp = macp->m_callbacks;
			VNET_FDBE_REFHOLD(vresp);
			RW_EXIT(&vnetp->vrwlock);

			cbp->mc_stop(macp->m_driver);

			WRITE_ENTER(&vnetp->vrwlock);
			vresp->flags &= ~VNET_STOPPING;
			VNET_FDBE_REFRELE(vresp);
		}
		/* the forward link is read only with the lock held again */
		vresp = vresp->nextp;
	}
	DBG1(vnetp, "exit\n");
}

/*
 * Setup kstats for the HIO statistics.
 * NOTE: the synchronization for the statistics is the
 * responsibility of the caller.
 *
 *    ks_mod:   module name passed to kstat_create()
 *    ks_name:  kstat name passed to kstat_create()
 *    vresp:    the resource; it must already be associated with a vnet
 *              (vresp->vnetp != NULL) and is stored as the kstat's private
 *              data.
 *
 * Returns the installed kstat, or NULL if kstat_create() fails.
 */
kstat_t *
vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
{
	kstat_t *ksp;
	vnet_t *vnetp = vresp->vnetp;
	vnet_hio_kstats_t *hiokp;
	size_t size;

	ASSERT(vnetp != NULL);
	/* number of kstat_named_t entries making up vnet_hio_kstats_t */
	size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
	ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
	    KSTAT_TYPE_NAMED, size, 0);
	if (ksp == NULL) {
		return (NULL);
	}

	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
	kstat_named_init(&hiokp->ipackets,		"ipackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->ierrors,		"ierrors",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->opackets,		"opackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->oerrors,		"oerrors",
	    KSTAT_DATA_ULONG);

	/* MIB II kstat variables */
	kstat_named_init(&hiokp->rbytes,		"rbytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->obytes,		"obytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multircv,		"multircv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multixmt,		"multixmt",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstrcv,		"brdcstrcv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstxmt,		"brdcstxmt",
KSTAT_DATA_ULONG); kstat_named_init(&hiokp->norcvbuf, "norcvbuf", KSTAT_DATA_ULONG); kstat_named_init(&hiokp->noxmtbuf, "noxmtbuf", KSTAT_DATA_ULONG); ksp->ks_update = vnet_hio_update_kstats; ksp->ks_private = (void *)vresp; kstat_install(ksp); return (ksp); } /* * Destroy kstats. */ static void vnet_hio_destroy_kstats(kstat_t *ksp) { if (ksp != NULL) kstat_delete(ksp); } /* * Update the kstats. */ static int vnet_hio_update_kstats(kstat_t *ksp, int rw) { vnet_t *vnetp; vnet_res_t *vresp; vnet_hio_stats_t statsp; vnet_hio_kstats_t *hiokp; vresp = (vnet_res_t *)ksp->ks_private; vnetp = vresp->vnetp; bzero(&statsp, sizeof (vnet_hio_stats_t)); READ_ENTER(&vnetp->vsw_fp_rw); if (vnetp->hio_fp == NULL) { /* not using hio resources, just return */ RW_EXIT(&vnetp->vsw_fp_rw); return (0); } VNET_FDBE_REFHOLD(vnetp->hio_fp); RW_EXIT(&vnetp->vsw_fp_rw); vnet_hio_get_stats(vnetp->hio_fp, &statsp); VNET_FDBE_REFRELE(vnetp->hio_fp); hiokp = (vnet_hio_kstats_t *)ksp->ks_data; if (rw == KSTAT_READ) { /* Link Input/Output stats */ hiokp->ipackets.value.ul = (uint32_t)statsp.ipackets; hiokp->ipackets64.value.ull = statsp.ipackets; hiokp->ierrors.value.ul = statsp.ierrors; hiokp->opackets.value.ul = (uint32_t)statsp.opackets; hiokp->opackets64.value.ull = statsp.opackets; hiokp->oerrors.value.ul = statsp.oerrors; /* MIB II kstat variables */ hiokp->rbytes.value.ul = (uint32_t)statsp.rbytes; hiokp->rbytes64.value.ull = statsp.rbytes; hiokp->obytes.value.ul = (uint32_t)statsp.obytes; hiokp->obytes64.value.ull = statsp.obytes; hiokp->multircv.value.ul = statsp.multircv; hiokp->multixmt.value.ul = statsp.multixmt; hiokp->brdcstrcv.value.ul = statsp.brdcstrcv; hiokp->brdcstxmt.value.ul = statsp.brdcstxmt; hiokp->norcvbuf.value.ul = statsp.norcvbuf; hiokp->noxmtbuf.value.ul = statsp.noxmtbuf; } else { return (EACCES); } return (0); } static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp) { mac_register_t *macp; mac_callbacks_t *cbp; uint64_t val; int stat; /* * get 
the specified statistics from the underlying nxge.
	 */
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	/* statistics from MAC_STAT_OVERFLOWS onwards are not queried */
	for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
		/* val is meaningful only when mc_getstat() returns 0 */
		if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
			switch (stat) {
			case MAC_STAT_IPACKETS:
				statsp->ipackets = val;
				break;

			case MAC_STAT_IERRORS:
				statsp->ierrors = val;
				break;

			case MAC_STAT_OPACKETS:
				statsp->opackets = val;
				break;

			case MAC_STAT_OERRORS:
				statsp->oerrors = val;
				break;

			case MAC_STAT_RBYTES:
				statsp->rbytes = val;
				break;

			case MAC_STAT_OBYTES:
				statsp->obytes = val;
				break;

			case MAC_STAT_MULTIRCV:
				statsp->multircv = val;
				break;

			case MAC_STAT_MULTIXMT:
				statsp->multixmt = val;
				break;

			case MAC_STAT_BRDCSTRCV:
				statsp->brdcstrcv = val;
				break;

			case MAC_STAT_BRDCSTXMT:
				statsp->brdcstxmt = val;
				break;

			case MAC_STAT_NOXMTBUF:
				statsp->noxmtbuf = val;
				break;

			case MAC_STAT_NORCVBUF:
				statsp->norcvbuf = val;
				break;

			default:
				/*
				 * statistics we are not interested in.
				 */
				break;
			}
		}
	}
}

/*
 * MAC capability entry point. Only MAC_CAPAB_RINGS is supported; B_FALSE
 * is returned for every other capability. A NULL arg also yields a false
 * result (written as 0 in the code).
 */
static boolean_t
vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
{
	vnet_t	*vnetp = (vnet_t *)arg;

	if (vnetp == NULL) {
		return (0);
	}

	switch (cap) {

	case MAC_CAPAB_RINGS: {

		mac_capab_rings_t *cap_rings = cap_data;
		/*
		 * Rings Capability Notes:
		 * We advertise rings to make use of the rings framework in
		 * gldv3 mac layer, to improve the performance. This is
		 * specifically needed when a Hybrid resource (with multiple
		 * tx/rx hardware rings) is assigned to a vnet device. We also
		 * leverage this for the normal case when no Hybrid resource is
		 * assigned.
		 *
		 * Ring Allocation:
		 * - TX path:
		 * We expose a pseudo ring group with 2 pseudo tx rings (as
		 * currently HybridIO exports only 2 rings) In the normal case,
		 * transmit traffic that comes down to the driver through the
		 * mri_tx (vnet_tx_ring_send()) entry point goes through the
		 * distributed switching algorithm in vnet and gets transmitted
		 * over a port/LDC in the vgen layer to either the vswitch or a
		 * peer vnet.
If and when a Hybrid resource is assigned to the * vnet, we obtain the tx ring information of the Hybrid device * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings. * Traffic being sent over the Hybrid resource by the mac layer * gets spread across both hw rings, as they are mapped to the * 2 pseudo tx rings in vnet. * * - RX path: * We expose a pseudo ring group with 3 pseudo rx rings (static * rings) initially. The first (default) pseudo rx ring is * reserved for the resource that connects to the vswitch * service. The next 2 rings are reserved for a Hybrid resource * that may be assigned to the vnet device. If and when a * Hybrid resource is assigned to the vnet, we obtain the rx * ring information of the Hybrid device (nxge) and map these * pseudo rings 1:1 to the 2 hw rx rings. For each additional * resource that connects to a peer vnet, we dynamically * allocate a pseudo rx ring and map it to that resource, when * the resource gets added; and the pseudo rx ring is * dynamically registered with the upper mac layer. We do the * reverse and unregister the ring with the mac layer when * the resource gets removed. * * Synchronization notes: * We don't need any lock to protect members of ring structure, * specifically ringp->hw_rh, in either the TX or the RX ring, * as explained below. * - TX ring: * ring->hw_rh is initialized only when a Hybrid resource is * associated; and gets referenced only in vnet_hio_tx(). The * Hybrid resource itself is available in fdb only after tx * hwrings are found and mapped; i.e, in vio_net_resource_reg() * we call vnet_bind_rings() first and then call * vnet_start_resources() which adds an entry to fdb. For * traffic going over LDC resources, we don't reference * ring->hw_rh at all. * - RX ring: * For rings mapped to Hybrid resource ring->hw_rh is * initialized and only then do we add the rx callback for * the underlying Hybrid resource; we disable callbacks before * we unmap ring->hw_rh. 
For rings mapped to LDC resources, we * stop the rx callbacks (in vgen) before we remove ring->hw_rh * (vio_net_resource_unreg()). * Also, we access ring->hw_rh in vnet_rx_ring_stat(). * Note that for rings mapped to Hybrid resource, though the * rings are statically registered with the mac layer, its * hardware ring mapping (ringp->hw_rh) can be torn down in * vnet_unbind_hwrings() while the kstat operation is in * progress. To protect against this, we hold a reference to * the resource in FDB; this ensures that the thread in * vio_net_resource_unreg() waits for the reference to be * dropped before unbinding the ring. * * We don't need to do this for rings mapped to LDC resources. * These rings are registered/unregistered dynamically with * the mac layer and so any attempt to unregister the ring * while kstat operation is in progress will block in * mac_group_rem_ring(). Thus implicitly protects the * resource (ringp->hw_rh) from disappearing. */ if (cap_rings->mr_type == MAC_RING_TYPE_RX) { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; /* * The ring_cnt for rx grp is initialized in * vnet_ring_grp_init(). Later, the ring_cnt gets * updated dynamically whenever LDC resources are added * or removed. */ cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt; cap_rings->mr_rget = vnet_get_ring; cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS; cap_rings->mr_gget = vnet_get_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; } else { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; /* * The ring_cnt for tx grp is initialized in * vnet_ring_grp_init() and remains constant, as we * do not support dymanic tx rings for now. */ cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt; cap_rings->mr_rget = vnet_get_ring; /* * Transmit rings are not grouped; i.e, the number of * transmit ring groups advertised should be set to 0. 
*/
			cap_rings->mr_gnum = 0;
			cap_rings->mr_gget = vnet_get_group;
			cap_rings->mr_gaddring = NULL;
			cap_rings->mr_gremring = NULL;
		}
		return (B_TRUE);

	}

	default:
		break;

	}

	return (B_FALSE);
}

/*
 * Callback function for MAC layer to get ring information.
 *
 * Records the mac layer's ring handle in the pseudo ring selected by
 * rtype/r_index and fills *infop with the driver handle and the entry
 * points for that ring. Ring types other than RX and TX are ignored.
 */
static void
vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
    const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
{
	vnet_t	*vnetp = arg;

	switch (rtype) {

	case MAC_RING_TYPE_RX: {

		vnet_pseudo_rx_group_t	*rx_grp;
		vnet_pseudo_rx_ring_t	*rx_ringp;
		mac_intr_t		*mintr;

		/* We advertised only one RX group */
		ASSERT(g_index == 0);
		rx_grp = &vnetp->rx_grp[g_index];

		/* Check the current # of rings in the rx group */
		ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));

		/* Get the ring based on the index */
		rx_ringp = &rx_grp->rings[r_index];

		rx_ringp->handle = r_handle;
		/*
		 * Note: we don't need to save the incoming r_index in rx_ring,
		 * as vnet_ring_grp_init() would have initialized the index for
		 * each ring in the array.
		 */
		rx_ringp->grp = rx_grp;
		rx_ringp->vnetp = vnetp;

		/* the pseudo ring itself is the interrupt handle */
		mintr = &infop->mri_intr;
		mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
		mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
		mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;

		infop->mri_driver = (mac_ring_driver_t)rx_ringp;
		infop->mri_start = vnet_rx_ring_start;
		infop->mri_stop = vnet_rx_ring_stop;
		infop->mri_stat = vnet_rx_ring_stat;

		/* Set the poll function, as this is an rx ring */
		infop->mri_poll = vnet_rx_poll;
		/*
		 * MAC_RING_RX_ENQUEUE bit needed to be set for nxge
		 * which was not sending packet chains in interrupt
		 * context. For such drivers, packets are queued in
		 * Rx soft rings so that we get a chance to switch
		 * into a polling mode under backlog. This bug (not
		 * sending packet chains) has now been fixed. Once
		 * the performance impact is measured, this change
		 * will be removed.
		 */
		infop->mri_flags = (vnet_mac_rx_queuing ?
MAC_RING_RX_ENQUEUE : 0);
		break;
	}

	case MAC_RING_TYPE_TX: {
		vnet_pseudo_tx_group_t	*tx_grp;
		vnet_pseudo_tx_ring_t	*tx_ringp;

		/*
		 * No need to check grp index; mac layer passes -1 for it.
		 */
		tx_grp = &vnetp->tx_grp[0];

		/* Check the # of rings in the tx group */
		ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));

		/* Get the ring based on the index */
		tx_ringp = &tx_grp->rings[r_index];

		/* unlike rx rings, the tx ring index is recorded here */
		tx_ringp->handle = r_handle;
		tx_ringp->index = r_index;
		tx_ringp->grp = tx_grp;
		tx_ringp->vnetp = vnetp;

		infop->mri_driver = (mac_ring_driver_t)tx_ringp;
		infop->mri_start = vnet_tx_ring_start;
		infop->mri_stop = vnet_tx_ring_stop;
		infop->mri_stat = vnet_tx_ring_stat;

		/* Set the transmit function, as this is a tx ring */
		infop->mri_tx = vnet_tx_ring_send;
		/*
		 * MAC_RING_TX_SERIALIZE bit needs to be set while
		 * hybridIO is enabled to workaround tx lock
		 * contention issues in nxge.
		 */
		infop->mri_flags = (vnet_mac_tx_serialize ?
		    MAC_RING_TX_SERIALIZE : 0);
		break;
	}

	default:
		break;
	}
}

/*
 * Callback function for MAC layer to get group information.
*/
static void
vnet_get_group(void *arg, mac_ring_type_t type, const int index,
    mac_group_info_t *infop, mac_group_handle_t handle)
{
	vnet_t	*vnetp = (vnet_t *)arg;

	switch (type) {

	case MAC_RING_TYPE_RX:
	{
		vnet_pseudo_rx_group_t	*rx_grp;

		/* We advertised only one RX group */
		ASSERT(index == 0);

		rx_grp = &vnetp->rx_grp[index];
		rx_grp->handle = handle;
		rx_grp->index = index;
		rx_grp->vnetp = vnetp;

		/* only the rx group provides add/remove mac address hooks */
		infop->mgi_driver = (mac_group_driver_t)rx_grp;
		infop->mgi_start = NULL;
		infop->mgi_stop = NULL;
		infop->mgi_addmac = vnet_addmac;
		infop->mgi_remmac = vnet_remmac;
		infop->mgi_count = rx_grp->ring_cnt;

		break;
	}

	case MAC_RING_TYPE_TX:
	{
		vnet_pseudo_tx_group_t	*tx_grp;

		/*
		 * We advertised only one TX group
		 *
		 * NOTE(review): vnet_m_capab() sets mr_gnum to 0 for tx
		 * rings; confirm whether the mac layer ever invokes this
		 * callback for MAC_RING_TYPE_TX.
		 */
		ASSERT(index == 0);

		tx_grp = &vnetp->tx_grp[index];
		tx_grp->handle = handle;
		tx_grp->index = index;
		tx_grp->vnetp = vnetp;

		infop->mgi_driver = (mac_group_driver_t)tx_grp;
		infop->mgi_start = NULL;
		infop->mgi_stop = NULL;
		infop->mgi_addmac = NULL;
		infop->mgi_remmac = NULL;
		infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;

		break;
	}

	default:
		break;

	}
}

/*
 * Ring start entry point for a pseudo rx ring. Records the generation
 * number and marks the ring started; a bound hwring is activated first.
 * Returns 0 on success, ENXIO if activating the hwring fails.
 */
static int
vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	int			err;

	/*
	 * If this ring is mapped to a LDC resource, simply mark the state to
	 * indicate the ring is started and return.
	 */
	if ((rx_ringp->state &
	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
		return (0);
	}

	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);

	/*
	 * This must be a ring reserved for a hwring. If the hwring is not
	 * bound yet, simply mark the state to indicate the ring is started and
	 * return. If and when a hybrid resource is activated for this vnet
	 * device, we will bind the hwring and start it then. If a hwring is
	 * already bound, start it now.
*/
	if (rx_ringp->hw_rh == NULL) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
		return (0);
	}

	err = mac_hwring_activate(rx_ringp->hw_rh);
	if (err == 0) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
	} else {
		/* any activation failure is reported as ENXIO */
		err = ENXIO;
	}

	return (err);
}

/*
 * Ring stop entry point for a pseudo rx ring. Clears the started state;
 * a bound hwring is quiesced first.
 */
static void
vnet_rx_ring_stop(mac_ring_driver_t arg)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;

	/*
	 * If this ring is mapped to a LDC resource, simply mark the state to
	 * indicate the ring is now stopped and return.
	 */
	if ((rx_ringp->state &
	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
		rx_ringp->state &= ~VNET_RXRING_STARTED;
		return;
	}

	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);

	/*
	 * This must be a ring reserved for a hwring. If the hwring is not
	 * bound yet, simply mark the state to indicate the ring is stopped and
	 * return. If a hwring is already bound, stop it now.
	 */
	if (rx_ringp->hw_rh == NULL) {
		rx_ringp->state &= ~VNET_RXRING_STARTED;
		return;
	}

	mac_hwring_quiesce(rx_ringp->hw_rh);
	rx_ringp->state &= ~VNET_RXRING_STARTED;
}

/*
 * Ring statistics entry point for a pseudo rx ring. The statistic is
 * fetched from the hwring for a Hybrid ring and from the resource's
 * mc_getstat() callback for a LDC ring. Always returns 0; the result of
 * the underlying query is ignored.
 */
static int
vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
	vnet_t			*vnetp = (vnet_t *)rx_ringp->vnetp;
	vnet_res_t		*vresp;
	mac_register_t		*macp;
	mac_callbacks_t		*cbp;

	/*
	 * Refer to vnet_m_capab() function for detailed comments on ring
	 * synchronization.
*/ if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) { READ_ENTER(&vnetp->vsw_fp_rw); if (vnetp->hio_fp == NULL) { RW_EXIT(&vnetp->vsw_fp_rw); return (0); } VNET_FDBE_REFHOLD(vnetp->hio_fp); RW_EXIT(&vnetp->vsw_fp_rw); (void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val); VNET_FDBE_REFRELE(vnetp->hio_fp); return (0); } ASSERT((rx_ringp->state & (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0); vresp = (vnet_res_t *)rx_ringp->hw_rh; macp = &vresp->macreg; cbp = macp->m_callbacks; (void) cbp->mc_getstat(macp->m_driver, stat, val); return (0); } /* ARGSUSED */ static int vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num) { vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg; tx_ringp->state |= VNET_TXRING_STARTED; return (0); } static void vnet_tx_ring_stop(mac_ring_driver_t arg) { vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg; tx_ringp->state &= ~VNET_TXRING_STARTED; } static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) { vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver; vnet_tx_ring_stats_t *statsp; statsp = &tx_ringp->tx_ring_stats; switch (stat) { case MAC_STAT_OPACKETS: *val = statsp->opackets; break; case MAC_STAT_OBYTES: *val = statsp->obytes; break; default: *val = 0; return (ENOTSUP); } return (0); } /* * Disable polling for a ring and enable its interrupt. */ static int vnet_ring_enable_intr(void *arg) { vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; vnet_res_t *vresp; if (rx_ringp->hw_rh == NULL) { /* * Ring enable intr func is being invoked, but the ring is * not bound to any underlying resource ? This must be a ring * reserved for Hybrid resource and no such resource has been * assigned to this vnet device yet. We simply return success. */ ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0); return (0); } /* * The rx ring has been bound to either a LDC or a Hybrid resource. * Call the appropriate function to enable interrupts for the ring. 
*/
	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		return (mac_hwring_enable_intr(rx_ringp->hw_rh));
	} else {
		/* for a LDC ring, hw_rh holds the resource itself */
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		return (vgen_enable_intr(vresp->macreg.m_driver));
	}
}

/*
 * Enable polling for a ring and disable its interrupt.
 */
static int
vnet_ring_disable_intr(void *arg)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	vnet_res_t		*vresp;

	if (rx_ringp->hw_rh == NULL) {
		/*
		 * Ring disable intr func is being invoked, but the ring is
		 * not bound to any underlying resource ? This must be a ring
		 * reserved for Hybrid resource and no such resource has been
		 * assigned to this vnet device yet. We simply return success.
		 */
		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
		return (0);
	}

	/*
	 * The rx ring has been bound to either a LDC or a Hybrid resource.
	 * Call the appropriate function to disable interrupts for the ring.
	 */
	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		return (mac_hwring_disable_intr(rx_ringp->hw_rh));
	} else {
		/* for a LDC ring, hw_rh holds the resource itself */
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		return (vgen_disable_intr(vresp->macreg.m_driver));
	}
}

/*
 * Poll 'bytes_to_pickup' bytes of message from the rx ring.
 *
 * Returns the chain obtained from the hwring (Hybrid ring) or from vgen
 * (LDC ring), or NULL if the ring is not bound to any resource.
 */
static mblk_t *
vnet_rx_poll(void *arg, int bytes_to_pickup)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	mblk_t			*mp = NULL;
	vnet_res_t		*vresp;
	vnet_t			*vnetp = rx_ringp->vnetp;

	if (rx_ringp->hw_rh == NULL) {
		return (NULL);
	}

	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
		/*
		 * Packets received over a hybrid resource need additional
		 * processing to remove the tag, for the pvid case. The
		 * underlying resource is not aware of the vnet's pvid and thus
		 * packets are received with the vlan tag in the header; unlike
		 * packets that are received over a ldc channel in which case
		 * the peer vnet/vsw would have already removed the tag.
*/
		if (vnetp->pvid != vnetp->default_vlan_id) {
			vnet_rx_frames_untag(vnetp->pvid, &mp);
		}
	} else {
		/* for a LDC ring, hw_rh holds the resource itself */
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		mp = vgen_rx_poll(vresp->macreg.m_driver, bytes_to_pickup);
	}
	return (mp);
}

/*
 * Receive callback for packets arriving over the Hybrid resource
 * (installed with mac_rx_set() in vnet_hio_mac_init()).
 *
 *    arg:  the vnet instance
 *    mrh:  the pseudo rx ring bound to the hwring the packets arrived on
 *    mp:   the packet chain; it is passed to the mac layer on that pseudo
 *          ring after the pvid tag, if applicable, has been removed.
 */
/* ARGSUSED */
void
vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
	vnet_t			*vnetp = (vnet_t *)arg;
	vnet_pseudo_rx_ring_t	*ringp = (vnet_pseudo_rx_ring_t *)mrh;

	/*
	 * Packets received over a hybrid resource need additional processing
	 * to remove the tag, for the pvid case. The underlying resource is
	 * not aware of the vnet's pvid and thus packets are received with the
	 * vlan tag in the header; unlike packets that are received over a ldc
	 * channel in which case the peer vnet/vsw would have already removed
	 * the tag.
	 */
	if (vnetp->pvid != vnetp->default_vlan_id) {

		vnet_rx_frames_untag(vnetp->pvid, &mp);
		/* nothing is left to deliver */
		if (mp == NULL) {
			return;
		}
	}
	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}

/*
 * Add-mac-address entry point of the rx group. Only the vnet's current
 * mac address is accepted (returns 0); any other address is rejected with
 * EINVAL.
 */
static int
vnet_addmac(void *arg, const uint8_t *mac_addr)
{
	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
	vnet_t			*vnetp;

	vnetp = rx_grp->vnetp;

	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
		return (0);
	}

	cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
	    vnetp->instance, __func__);
	return (EINVAL);
}

/*
 * Remove-mac-address entry point of the rx group. Only the vnet's current
 * mac address is accepted (returns 0); any other address is rejected with
 * EINVAL.
 */
static int
vnet_remmac(void *arg, const uint8_t *mac_addr)
{
	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
	vnet_t			*vnetp;

	vnetp = rx_grp->vnetp;

	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
		return (0);
	}

	cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
	    vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
	return (EINVAL);
}

/*
 * Open the link named by ifname as an exclusive mac client, add the vnet's
 * current mac address to it, register it with vnet as a Hybrid resource
 * and install vnet_hio_rx_cb() as its receive callback.
 *
 * Returns 0 on success; EAGAIN if mac_alloc() fails; the error of
 * mac_open_by_linkname() if the link cannot be opened; 1 for any later
 * failure, after undoing the steps taken so far.
 */
int
vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
{
	mac_handle_t		mh;
	mac_client_handle_t	mch = NULL;
	mac_unicast_handle_t	muh = NULL;
	mac_diag_t		diag;
	mac_register_t		*macp;
	char			client_name[MAXNAMELEN];
	int			rv;
	uint16_t		mac_flags = MAC_UNICAST_TAG_DISABLE |
	    MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
	vio_net_callbacks_t
vcb; ether_addr_t rem_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; uint32_t retries = 0; if ((macp = mac_alloc(MAC_VERSION)) == NULL) { return (EAGAIN); } do { rv = mac_open_by_linkname(ifname, &mh); if (rv == 0) { break; } if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) { mac_free(macp); return (rv); } drv_usecwait(vnet_mac_open_delay); } while (rv == ENOENT); vnetp->hio_mh = mh; (void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance, ifname); rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE); if (rv != 0) { goto fail; } vnetp->hio_mch = mch; rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0, &diag); if (rv != 0) { goto fail; } vnetp->hio_muh = muh; macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; macp->m_driver = vnetp; macp->m_dip = NULL; macp->m_src_addr = NULL; macp->m_callbacks = &vnet_hio_res_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = ETHERMTU; rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID, vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb); if (rv != 0) { goto fail; } mac_free(macp); /* add the recv callback */ mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp); return (0); fail: mac_free(macp); vnet_hio_mac_cleanup(vnetp); return (1); } void vnet_hio_mac_cleanup(vnet_t *vnetp) { if (vnetp->hio_vhp != NULL) { vio_net_resource_unreg(vnetp->hio_vhp); vnetp->hio_vhp = NULL; } if (vnetp->hio_muh != NULL) { (void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh); vnetp->hio_muh = NULL; } if (vnetp->hio_mch != NULL) { mac_client_close(vnetp->hio_mch, 0); vnetp->hio_mch = NULL; } if (vnetp->hio_mh != NULL) { mac_close(vnetp->hio_mh); vnetp->hio_mh = NULL; } } /* Bind pseudo rings to hwrings */ static int vnet_bind_hwrings(vnet_t *vnetp) { mac_ring_handle_t hw_rh[VNET_NUM_HYBRID_RINGS]; mac_perim_handle_t mph1; vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_rx_ring_t *rx_ringp; vnet_pseudo_tx_group_t *tx_grp; vnet_pseudo_tx_ring_t *tx_ringp; int hw_ring_cnt; int i; int rv; 
mac_perim_enter_by_mh(vnetp->hio_mh, &mph1); /* Get the list of the underlying RX rings. */ hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh, MAC_RING_TYPE_RX); /* We expect the the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */ if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) { cmn_err(CE_WARN, "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n", vnetp->instance, hw_ring_cnt); goto fail; } if (vnetp->rx_hwgh != NULL) { /* * Quiesce the HW ring and the mac srs on the ring. Note * that the HW ring will be restarted when the pseudo ring * is started. At that time all the packets will be * directly passed up to the pseudo RX ring and handled * by mac srs created over the pseudo RX ring. */ mac_rx_client_quiesce(vnetp->hio_mch); mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE); } /* * Bind the pseudo rings to the hwrings and start the hwrings. * Note we don't need to register these with the upper mac, as we have * statically exported these pseudo rxrings which are reserved for * rxrings of Hybrid resource. */ rx_grp = &vnetp->rx_grp[0]; for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { /* Pick the rxrings reserved for Hybrid resource */ rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX]; /* Store the hw ring handle */ rx_ringp->hw_rh = hw_rh[i]; /* Bind the pseudo ring to the underlying hwring */ mac_hwring_setup(rx_ringp->hw_rh, (mac_resource_handle_t)rx_ringp, NULL); /* Start the hwring if needed */ if (rx_ringp->state & VNET_RXRING_STARTED) { rv = mac_hwring_activate(rx_ringp->hw_rh); if (rv != 0) { mac_hwring_teardown(rx_ringp->hw_rh); rx_ringp->hw_rh = NULL; goto fail; } } } /* Get the list of the underlying TX rings. 
*/ hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh, MAC_RING_TYPE_TX); /* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */ if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) { cmn_err(CE_WARN, "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n", vnetp->instance, hw_ring_cnt); goto fail; } /* * Now map the pseudo txrings to the hw txrings. Note we don't need * to register these with the upper mac, as we have statically exported * these rings. Note that these rings will continue to be used for LDC * resources to peer vnets and vswitch (shared ring). */ tx_grp = &vnetp->tx_grp[0]; for (i = 0; i < tx_grp->ring_cnt; i++) { tx_ringp = &tx_grp->rings[i]; tx_ringp->hw_rh = hw_rh[i]; tx_ringp->state |= VNET_TXRING_HYBRID; } tx_grp->tx_notify_handle = mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp); mac_perim_exit(mph1); return (0); fail: mac_perim_exit(mph1); vnet_unbind_hwrings(vnetp); return (1); } /* Unbind pseudo rings from hwrings */ static void vnet_unbind_hwrings(vnet_t *vnetp) { mac_perim_handle_t mph1; vnet_pseudo_rx_ring_t *rx_ringp; vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_tx_group_t *tx_grp; vnet_pseudo_tx_ring_t *tx_ringp; int i; mac_perim_enter_by_mh(vnetp->hio_mh, &mph1); tx_grp = &vnetp->tx_grp[0]; for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { tx_ringp = &tx_grp->rings[i]; if (tx_ringp->state & VNET_TXRING_HYBRID) { tx_ringp->state &= ~VNET_TXRING_HYBRID; tx_ringp->hw_rh = NULL; } } (void) mac_client_tx_notify(vnetp->hio_mch, NULL, tx_grp->tx_notify_handle); rx_grp = &vnetp->rx_grp[0]; for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX]; if (rx_ringp->hw_rh != NULL) { /* Stop the hwring */ mac_hwring_quiesce(rx_ringp->hw_rh); /* Teardown the hwring */ mac_hwring_teardown(rx_ringp->hw_rh); rx_ringp->hw_rh = NULL; } } if (vnetp->rx_hwgh != NULL) { vnetp->rx_hwgh = NULL; /* * First clear the permanent-quiesced flag of the RX srs then * restart the HW ring and 
the mac srs on the ring. */ mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE); mac_rx_client_restart(vnetp->hio_mch); } mac_perim_exit(mph1); } /* Bind pseudo ring to a LDC resource */ static int vnet_bind_vgenring(vnet_res_t *vresp) { vnet_t *vnetp; vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_rx_ring_t *rx_ringp; mac_perim_handle_t mph1; int rv; int type; vnetp = vresp->vnetp; type = vresp->type; rx_grp = &vnetp->rx_grp[0]; if (type == VIO_NET_RES_LDC_SERVICE) { /* * Ring Index 0 is the default ring in the group and is * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring * is allocated statically and is reported to the mac layer * in vnet_m_capab(). So, all we need to do here, is save a * reference to the associated vresp. */ rx_ringp = &rx_grp->rings[0]; rx_ringp->hw_rh = (mac_ring_handle_t)vresp; vresp->rx_ringp = (void *)rx_ringp; return (0); } ASSERT(type == VIO_NET_RES_LDC_GUEST); mac_perim_enter_by_mh(vnetp->mh, &mph1); rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp); if (rx_ringp == NULL) { cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring", vnetp->instance); goto fail; } /* Store the LDC resource itself as the ring handle */ rx_ringp->hw_rh = (mac_ring_handle_t)vresp; /* * Save a reference to the ring in the resource for lookup during * unbind. Note this is only done for LDC resources. We don't need this * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its * rx rings are mapped to reserved pseudo rx rings (index 1 and 2). 
*/ vresp->rx_ringp = (void *)rx_ringp; rx_ringp->state |= VNET_RXRING_LDC_GUEST; /* Register the pseudo ring with upper-mac */ rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index); if (rv != 0) { rx_ringp->state &= ~VNET_RXRING_LDC_GUEST; rx_ringp->hw_rh = NULL; vnet_free_pseudo_rx_ring(vnetp, rx_ringp); goto fail; } mac_perim_exit(mph1); return (0); fail: mac_perim_exit(mph1); return (1); } /* Unbind pseudo ring from a LDC resource */ static void vnet_unbind_vgenring(vnet_res_t *vresp) { vnet_t *vnetp; vnet_pseudo_rx_group_t *rx_grp; vnet_pseudo_rx_ring_t *rx_ringp; mac_perim_handle_t mph1; int type; vnetp = vresp->vnetp; type = vresp->type; rx_grp = &vnetp->rx_grp[0]; if (vresp->rx_ringp == NULL) { return; } if (type == VIO_NET_RES_LDC_SERVICE) { /* * Ring Index 0 is the default ring in the group and is * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring * is allocated statically and is reported to the mac layer * in vnet_m_capab(). So, all we need to do here, is remove its * reference to the associated vresp. 
*/ rx_ringp = &rx_grp->rings[0]; rx_ringp->hw_rh = NULL; vresp->rx_ringp = NULL; return; } ASSERT(type == VIO_NET_RES_LDC_GUEST); mac_perim_enter_by_mh(vnetp->mh, &mph1); rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp; vresp->rx_ringp = NULL; if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) { /* Unregister the pseudo ring with upper-mac */ mac_group_rem_ring(rx_grp->handle, rx_ringp->handle); rx_ringp->hw_rh = NULL; rx_ringp->state &= ~VNET_RXRING_LDC_GUEST; /* Free the pseudo rx ring */ vnet_free_pseudo_rx_ring(vnetp, rx_ringp); } mac_perim_exit(mph1); } static void vnet_unbind_rings(vnet_res_t *vresp) { switch (vresp->type) { case VIO_NET_RES_LDC_SERVICE: case VIO_NET_RES_LDC_GUEST: vnet_unbind_vgenring(vresp); break; case VIO_NET_RES_HYBRID: vnet_unbind_hwrings(vresp->vnetp); break; default: break; } } static int vnet_bind_rings(vnet_res_t *vresp) { int rv; switch (vresp->type) { case VIO_NET_RES_LDC_SERVICE: case VIO_NET_RES_LDC_GUEST: rv = vnet_bind_vgenring(vresp); break; case VIO_NET_RES_HYBRID: rv = vnet_bind_hwrings(vresp->vnetp); break; default: rv = 1; break; } return (rv); } /* ARGSUSED */ int vnet_hio_stat(void *arg, uint_t stat, uint64_t *val) { vnet_t *vnetp = (vnet_t *)arg; *val = mac_stat_get(vnetp->hio_mh, stat); return (0); } /* * The start() and stop() routines for the Hybrid resource below, are just * dummy functions. This is provided to avoid resource type specific code in * vnet_start_resources() and vnet_stop_resources(). The starting and stopping * of the Hybrid resource happens in the context of the mac_client interfaces * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup(). 
*/ /* ARGSUSED */ static int vnet_hio_start(void *arg) { return (0); } /* ARGSUSED */ static void vnet_hio_stop(void *arg) { } mblk_t * vnet_hio_tx(void *arg, mblk_t *mp) { vnet_pseudo_tx_ring_t *tx_ringp; mblk_t *nextp; mblk_t *ret_mp; tx_ringp = (vnet_pseudo_tx_ring_t *)arg; for (;;) { nextp = mp->b_next; mp->b_next = NULL; ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp); if (ret_mp != NULL) { ret_mp->b_next = nextp; mp = ret_mp; break; } if ((mp = nextp) == NULL) break; } return (mp); } #ifdef VNET_IOC_DEBUG /* * The ioctl entry point is used only for debugging for now. The ioctl commands * can be used to force the link state of the channel connected to vsw. */ static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp) { struct iocblk *iocp; vnet_t *vnetp; iocp = (struct iocblk *)(uintptr_t)mp->b_rptr; iocp->ioc_error = 0; vnetp = (vnet_t *)arg; if (vnetp == NULL) { miocnak(q, mp, 0, EINVAL); return; } switch (iocp->ioc_cmd) { case VNET_FORCE_LINK_DOWN: case VNET_FORCE_LINK_UP: vnet_force_link_state(vnetp, q, mp); break; default: iocp->ioc_error = EINVAL; miocnak(q, mp, 0, iocp->ioc_error); break; } } static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp) { mac_register_t *macp; mac_callbacks_t *cbp; vnet_res_t *vresp; READ_ENTER(&vnetp->vsw_fp_rw); vresp = vnetp->vsw_fp; if (vresp == NULL) { RW_EXIT(&vnetp->vsw_fp_rw); return; } macp = &vresp->macreg; cbp = macp->m_callbacks; cbp->mc_ioctl(macp->m_driver, q, mp); RW_EXIT(&vnetp->vsw_fp_rw); } #else static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp) { vnet_t *vnetp; vnetp = (vnet_t *)arg; if (vnetp == NULL) { miocnak(q, mp, 0, EINVAL); return; } /* ioctl support only for debugging */ miocnak(q, mp, 0, ENOTSUP); } #endif