/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Intel I225/226 Ethernet Driver
 * ------------------------------
 *
 * This driver implements support for the Intel I225 and I226 Ethernet
 * controllers, which support up to 2.5 GbE and generally only support BASE-T
 * copper PHYs. This device is yet another variant on the venerable Intel 1 GbE
 * devices that are found in e1000g(4D) and igb(4D). This is its own driver in
 * part because that's how Intel structured things when it refactored its
 * common code, which we import and which is found in the 'core' directory.
 *
 * There is not a good datasheet for the MAC that we've been able to find for
 * this part. It's not clear that Intel even has a doc for this in their
 * Resource and Design Center. The primary datasheet documents the NVM and
 * other parts of it, but not the software interface. Based on observations
 * from the common code we describe this as somewhat of an evolution of the
 * I217 and I210, with fewer features than the I210, which comes from the
 * server world (and which was itself a more stripped-down I350).
 *
 * The result of all this is us trying to focus on what we know about this part
 * and making some assumptions along the way. This includes things like:
 *
 * 1) We believe that the device only supports up to 4 RX and TX queues.
 * 2) There is only one TX context for each TX queue and it is mapped to the
 *    queue.
 * 3) There is no support for the head writeback modes that we've found.
 * 4) This does otherwise support both the MSI-X and MSI/INTx interrupt
 *    management which are shaped very differently in the device.
 * 5) The 2500BASE-T PHY support is unique, but the other PHY settings are
 *    roughly the same as far as we can tell.
 *
 * There are certainly more differences than the points up above, but the above
 * are ones that generally influence our design.
 *
 * ------------
 * Organization
 * ------------
 *
 * This driver is first broken into two different pieces. There is the 'core'
 * code which we import from Intel via FreeBSD. All of these sources are in the
 * 'uts/common/io/igc/core' directory and we try our hardest to avoid modifying
 * them (hence the smatch gags). The core code can be thought of as abstracting
 * the MAC, NVM, and PHY across different chipsets (right now it's all the
 * I225) and providing us with a series of library calls that we can make to
 * manage the chip.
 *
 * The remaining files that sit alongside this one implement different portions
 * of functionality related to the device. In particular:
 *
 *   igc.[ch]:          This is the main entry point for the driver and the
 *                      source of this block comment. It implements all of the
 *                      basic DDI entry points: attach and detach, interrupts,
 *                      PCI config space and register set up and tear down.
 *
 *                      The header file contains all of the structure
 *                      definitions that we use throughout this and the basic
 *                      constants we use for sizing.
 *
 *   igc_gld.c          This file implements all of the major GLDv3 required
 *                      entry points that are found in mac(9E). The guts of the
 *                      actual I/O are in igc_ring.c, but getting and setting
 *                      all of the various MAC properties and other bits is
 *                      here.
 *
 *   igc_osdep.[ch]     The osdep (OS dependent) files are used to define and
 *                      implement the functionality required by the common
 *                      code. igc_osdep.h is included in the build of each
 *                      file.
 *
 *                      We have a second use for igc_osdep.h, which is where we
 *                      put missing hardware definitions that apply. This is
 *                      for cases where the core code doesn't have it and it
 *                      should really live in igc_defines.h or igc_regs.h, but
 *                      we keep it here to avoid modifying those.
 *
 *   igc_ring.c         This implements the core I/O routines of the device
 *                      driver, starting with the descriptor ring setup and
 *                      tear down as well as DMA, descriptor ring, and
 *                      per-frame memory. It also implements all of the primary
 *                      logic for transmitting and receiving frames.
 *
 *   igc_stat.c         This file deals with kstat creation and destruction as
 *                      well as reading and fetching all of the registers that
 *                      exist in hardware.
 *
 * There are a few primary data structures to be aware of. Their relationships
 * are shown in the following image and then described. Note, each structure
 * has many more fields than those pictured:
 *
 *  +---------------+
 *  | dev_info_t *  |
 *  |              -|-+
 *  | private data  | |
 *  +---------------+ v
 *  +------------------------------+        +---------------------+
 *  | igc_t                        |        | igc_addr_t          |
 *  | per-instance primary         |  +---->|                     |
 *  | structure                    |  |+--->| Notes a MAC address | ...
 *  |                              |  ||    | stored in hardware  |
 *  | igc_addr_t *igc_ucast       -|--+|    +---------------------+
 *  | igc_addr_t *igc_mcast       -|---+    +---------------------------+
 *  | struct igc_hw *igc_hw       -|------->| struct igc_hw (core code) |
 *  | igc_tx_ring_t *igc_tx_rings -|--+     |                           |
 *  | igc_rx_ring_t *igc_rx_rings -|--|---+ | igc_mac_info mac          |
 *  +------------------------------+  |   | | igc_fc_info fc            |
 *                                     |   | | igc_phy_info phy         |
 *  +----------------------------------+   | | igc_nvm_info nvm         |
 *  |                                      v +---------------------------+
 *  |  +--------------------------------------+
 *  |  | igc_rx_ring_t                        |
 *  |  |                                      |
 *  |  | igc_adv_rx_desc *irr_ring         ---|--> rx hw descriptor ring
 *  |  | uint32_t irr_next                 ---|--> next entry to look for data
 *  |  | igc_rx_buffer_t **irr_work_list   ---|--> corresponds to ring entries
 *  |  | uint32_t irr_nfree                ---|--> number of free list entries
 *  |  | igc_rx_buffer_t **irr_free_list   ---|--> set of buffers free for bind
 *  |  | igc_rx_buffer_t *irr_arena        ---|-+> array of all rx buffers
 *  |  +--------------------------------------+ |
 *  |                                           |
 *  |  +----------------------------+           |
 *  |  | igc_rx_buffer_t            |<----------+
 *  |  |                            |
 *  |  | mblk_t *irb_mp            -|---> mblk_t for rx buffer
 *  |  | igc_dma_buffer_t irb_dma  -|---> DMA memory for rx buffer
 *  |  +----------------------------+
 *  |
 *  |  +------------------------------------+
 *  +->| igc_tx_ring_t                      |
 *     |                                    |
 *     | igc_adv_tx_desc *itr_ring        --|--> tx hw descriptor ring
 *     | uint32_t itr_ring_head           --|--> next descriptor to recycle
 *     | uint32_t itr_ring_tail           --|--> next descriptor to place
 *     | uint32_t itr_ring_free           --|--> free descriptors in ring
 *     | igc_tx_buffer_t **itr_work_list  --|--> corresponds to ring entries
 *     | list_t itr_free_list             --|--> available tx buffers
 *     | igc_tx_buffer_t *itr_arena       --|-+> array of all tx buffers
 *     +------------------------------------+ |
 *                                            |
 *     +---------------------------------+    |
 *     | igc_tx_buffer_t                 |<---+
 *     |                                 |
 *     | mblk_t *itb_mp                --|--> mblk to tx (only in first)
 *     | igc_dma_buffer_t itb_dma      --|--> tx DMA buffer for copy
 *     | ddi_dma_handle_t itb_bind_hdl --|--> DMA handle for bind
 *     +---------------------------------+
 *
 * igc_t                This is the primary data structure that exists for
 *                      each instance of the driver. There is generally a 1:1
 *                      relationship between a physical port, an instance of
 *                      the driver, and a PCI function. This structure provides
 *                      access to the device's registers and it embeds the
 *                      common code's struct igc_hw.
 *
 * struct igc_hw        This structure is used by the core code and it contains
 *                      information related to the MAC, PHY, NVM, and related
 *                      information that the device uses. In general, this
 *                      structure is used when performing API calls to the
 *                      common code. The common code calls back into us in the
 *                      igc_osdep.c interfaces.
 *
 * igc_tx_ring_t        This structure represents a single transmit ring in
 *                      hardware, its associated software state, and
 *                      miscellaneous data like statistics, MAC handles, etc.
 *                      See the 'TX Data Path Design' section for more
 *                      information.
 *
 * igc_rx_ring_t        This is the receive variant of a ring. It represents
 *                      and tracks the hardware state along with all our
 *                      metadata. One of these exists for each receive ring
 *                      that we've enabled (currently one). See the 'RX Data
 *                      Path Design' section for more information.
 *
 * igc_tx_buffer_t      This represents a single tx buffer in the driver. A tx
 *                      buffer contains DMA based storage that it can use to
 *                      transmit a packet and contains a second DMA handle that
 *                      can be used to bind a specific mblk_t to it. tx buffers
 *                      are capped at the current page size and can be smaller
 *                      if the maximum packet size is smaller. A 1500 byte MTU
 *                      will end up with a 2 KiB buffer due to the device's
 *                      internal alignment requirements.
 *
 * igc_rx_buffer_t      This represents a single rx buffer in the driver. These
 *                      buffers may be loaned up to MAC and then returned to us
 *                      later. They contain a single DMA buffer which right now
 *                      is a single contiguous buffer that fits the maximum
 *                      packet size. Each buffer has a corresponding mblk_t
 *                      that it is mapped to.
 *
 * igc_dma_buffer_t     This represents a DMA buffer in the system. DMA buffers
 *                      are used for transmit buffers, receive buffers, or
 *                      various ring descriptor entries. The DMA buffer
 *                      structure is not inherently limited to a specific
 *                      number of cookies. It is always mapped in our virtual
 *                      address space and encapsulates the various DDI
 *                      functions. In general, one expects to interface with
 *                      the idb_va member when needing to access the memory,
 *                      the idb_size member when wanting to understand how much
 *                      memory is in the buffer, and the idb_hdl member when
 *                      needing to access the DMA cookies.
 *
 * igc_addr_t           This represents a 48-bit Ethernet MAC address slot in
 *                      the hardware that may or may not be used at any given
 *                      point in time.
 *
 * --------------------
 * Rings and Interrupts
 * --------------------
 *
 * The I225/226 controller, like the I210, supports up to 4 rx and tx rings.
 * Due to the long history of this controller and its lineage from the
 * e1000g/igb days and much older parts like the 8254x series, it has two
 * entirely different sets of interrupt modes: one where MSI-X is used and one
 * where a single MSI or INTx interrupt is used. Currently the driver only
 * supports the MSI-X mode, as that gives us more flexibility and, because the
 * interrupt modes and register handling are different, reduces the complexity
 * in the driver.
 *
 * The hardware uses its IVAR registers to map specific queues to interrupts.
 * Each rx queue and tx queue is mapped to a specific bit position in the IVAR
 * and there is an additional IVAR register for miscellaneous causes like link
 * state changes.
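 *
 * As a rough sketch of what that mapping looks like (igc_write_ivar() later in
 * this file is the authoritative implementation), each IVAR register covers
 * two queues, and each queue gets its own field within the register holding
 * the MSI-X vector number plus a valid bit, approximately:
 *
 *      reg   = IGC_IVAR0 + (queue / 2) * 4;
 *      field = the rx or tx field for (queue % 2);
 *      value = msix_vector | IGC_IVAR_VALID;
 *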
While the IVAR register allows for several bits for MSI-X * entries, for the most part, it appears that there is only support for values * in the range [0, 4] based on the I210 which we believe extends to the I225/6. * * MSI-X mode causes the device's various interrupt registers to be split into * two groups the 'legacy' and 'extended' (sometimes called advanced) ones. The * extended ones all start with 'E'. When in MSI-X mode, the EICR (cause), EICS * (cause set), EIAC (auto-clear), EIMS (mask set) registers all operate with * indexes that refer to the MSI-X. The primary way to temporarily disable * interrupts for polling is to remove the given MSI-X from the auto-clear set * and to clear it from the enabled mask set. * * The implication of all of this is that we can only really disable interrupts * for polling on a per-MSI-X basis. This generally means that the design for * interrupts and rings is that all the tx rings and the link state change * events share interrupt 0, while rx rings use interrupts 1-4. Because the x86 * 'apix' modules end up defaulting to two interrupts to a driver, we end up * only supporting a single rx and tx ring for the time being, though the driver * is phrased in terms of a variable number of such rings. * * ------------------- * RX Data Path Design * ------------------- * * The rx data path is based around allocating a fixed number of receive buffers * for each ring. We have two goals in the allocation buffer and ring design: * * 1) We want to make sure that the ring is always full of valid descriptors for * rx to prevent stalls. One implication of this is that we will always * refill a received buffer with a new one and notify the hardware that the * buffer is usable again. * * 2) We would prefer to not have to copy received memory and instead bind the * DMA memory directly into an mblk_t. * * To satisfy (1) we need to allocate at least as many rx buffers as there are * ring entries. The ring is sized by default to 512 entries, which is a * somewhat arbitrary, but common, size. We then say that we want to be able to * loan half of our entries up the stack at any given time. This leads to us * allocating 1.5x the ring size rx buffers. * * All of the rx buffers are stored in the irr_arena array. They are then split * between the free list and the ring's work list. The work list is an array * that is a 1:1 mapping to a location in the descriptor ring. That is index 4 * of the work list (irr_work_list[4]) corresponds to index 4 of the descriptor * ring (irr_ring[4]). However, this may refer to any of the rx descriptors that * is in the irr_arena. When we start up the ring, the first ring size entries * are all inserted into the work list and then the remaining entries are * inserted into the free list. * * Entries that are in the work list are always given to hardware. We track the * next place for us to scan for received packets through the 'irr_next' index * into the descriptor ring. When an interrupt fires, we start at irr_next and * iterate through the descriptor ring continuing while we find valid, received * packets. When we process a packet, we look at two things to consider whether * we bind it or copy it to a new mblk_t. The first piece is the received * packet's length. If the packet is small, there is not much value in binding * it and instead we just allocate and copy a new buffer for the packet. * * The second is if there are free rx descriptors. 
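 *
 * Putting both checks together, the decision the rx path (igc_ring_rx() in
 * igc_ring.c) makes for each received frame is roughly the following sketch,
 * where igc_rx_bind_thresh is the length cut-off that igc_attach() sets up:
 *
 *      if (frame length >= igc->igc_rx_bind_thresh and the free list is
 *          not empty)
 *              loan the rx buffer up the stack, replacing it in the work
 *              list with a buffer taken from irr_free_list
 *      else
 *              allocate a new mblk_t and copy the frame into it, leaving
 *              the rx buffer in place
 *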
 * To keep goal (1) valid, we will only loan a packet up if there is an entry
 * on the free list that can replace the rx buffer, as otherwise we could stall
 * the ring. If an rx buffer is loaned, the entry on the free list takes its
 * place in the descriptor ring and when the networking stack is finally done
 * with the mblk_t, it'll be returned to us as part of the freemsg()/freeb()
 * destructor. This lifetime is illustrated in the following diagram:
 *
 *  +-------------+                            +-----------+
 *  | Work List   |<---*------------------------| Free List |
 *  | Owned by HW |    . . Used to replace      | Idle      |
 *  +-------------+        loaned buffers       +-----------+
 *    |   ^                                          ^
 *    |   .                                          |
 *    |   . . . Reused if a                          |
 *    |         copy is done       Returned to driver via
 *    |                            freemsg() which calls
 *    v                            igc_rx_recycle().
 *  +-------------------+                            |
 *  | Loaned            |----------------------------+
 *  | Owned by netstack |
 *  +-------------------+
 *
 * Currently the rx data path uses rx buffers that are equal to the maximum
 * size of a packet (rounded up based on hardware's 1 KiB alignment
 * requirement). This was mostly done for initial simplicity, though it comes
 * at a memory cost. It is possible to design this to be more like the tx
 * subsystem where we use fixed page size buffers and just cons up an mblk_t
 * chain with b_cont pointers.
 *
 * -------------------
 * TX Data Path Design
 * -------------------
 *
 * The tx data path is a bit different in design from the rx data path. When
 * the system wants to tx data there are two fundamental building blocks that
 * we use, both of which leverage the igc_tx_buffer_t:
 *
 * 1) We use the DMA memory that is allocated with the buffer and copy the
 *    mblk_t data into it. This is used when we have small mblk_t's.
 *
 * 2) We utilize the DMA handle that is in the tx buffer (but not the buffer's
 *    DMA memory) to perform DMA binding. This can result in multiple cookies
 *    and therefore descriptors mapping to the single buffer.
 *
 * Because a given tx buffer may end up using more than one descriptor and we
 * have to account for transmit context descriptors, which are used for
 * indicating checksum and segmentation offloads, we end up only allocating a
 * number of transmit buffers equal to the ring size. In addition, the tx data
 * buffer's maximum size is capped at the size of a single page. This is done
 * because we often aren't going to be copying and if we are, we don't need
 * that much more memory. The actual size may be smaller depending on the MTU.
 *
 * The tx descriptor ring is used in a bit of a different way. While part of
 * the reason for this is that we are filling it based on the stack's demands
 * and therefore only need to fill in descriptors when there's a need, the
 * second reason is because of how the hardware reports back events. There are
 * two major kinds of descriptors that can be entered into the ring: the
 * aforementioned context descriptors and then data descriptors. While data
 * descriptors support an interrupt on completion, context descriptors do not.
 *
 * When an mblk_t comes in to be transmitted, we walk all of the mblk_t's
 * associated with it via the b_cont pointer. For each one, we look at the size
 * of the data and determine whether or not to perform DMA binding or to copy
 * it into the current tx buffer. A given tx buffer can be used to copy
 * multiple different mblk_t's. Imagine a pathological case where we had a 500
 * byte packet split into 125 byte chunks; this would end up using a single tx
 * data buffer.
However, if you imagine a large chunk of TCP data, this may be * spread across several mblk_t's so we may end up leveraging multiple tx data * buffers. * * The transmit buffers that are available are stored on a free list. This is * managed as a list_t as we end up needing to often track groups of descriptors * to allocate and free across packet transmit and recycling. We don't count the * number of transmit buffers that are free per se, but it generally tracks the * number of free descriptors which do track as in the worst case there is a 1:1 * relationship between buffers and descriptors and more generally it's 1:n, * that is there are multiple descriptors used for a single buffer. * * The transmit ring is managed through a combination of three integers, the * itr_ring_head, the itr_ring_tail, and the itr_ring_free. The ring's tail * represents the place where the driver will place new data to transmit. The * ring's head represents the first place that we should check for a packet's * completion when we're performing recycling (the act of acknowledging what * hardware has processed internal to the driver) due to a tx interrupt or * manual recycling in the transmit path. * * When placing a packet as a series of descriptor rings we'll end up doing the * following: * * 1) First we determine how to map each mblk_t as mentioned above. * 2) This will then be turned into descriptors in the ring. Each tx data buffer * that is used is placed in the itr_work_list at the corresponding index * that they are used in the ring. There is one special case here, if a * context descriptor is used, the first transmit buffer will refer to the * context descriptor's entry (which always comes before data). * 3) We'll ensure that there are enough descriptors for this packet to fit into * the ring or if it would exceed our mandatory gap threshold. If so, then * we'll undo all the work we just did and return the mblk_t to MAC and * indicate that the ring is blocked. MAC will be notified later when we free * up transmit descriptors. * 4) In the first transmit data buffer we'll store both the mblk_t and then * we'll store what the index of the last descriptor that's used is. This is * important for recycling. We also indicate that the last descriptor should * be the one that reports its status on interrupt completion. * 5) We'll notify hardware that there is data for it to transmit by writing to * the ring's tail pointer. * * This all works reasonably okay, except for the small problem of the bill, * which we pay off in the form of recycling. Recycling is going through the * ring and seeing which descriptors are free. While the transmit path described * above is the only path that is allowed to move the tail, the recycling path * is the only one that's allowed to adjust the head. * * When we perform recycling we look at the current head and its corresponding * tx buffer. There will always be a tx buffer in the same index in the * itr_work_list[] unless a serious programmer error has occurred. This buffer * will tell us what the index to check for completion is via its itb_last_desc * member (only valid when itb_first is set to true). If this index indicates * that it has been processed by hardware, then we process all entries between * here and there. * * When we process descriptors, we bunch up the transmit descriptors and * mblk_t's. We'll reset the transmit descriptor (freeing any DMA binding if * used) and append the mblk_t if it exists to be freed in one large * freemsgchain() at the end. 
The fact that we won't free any tx buffers * associated with a packet until they're all done is important. This makes * sure that any memory that we have bound from the mblk_t remains valid the * entire time. * * If we have freed enough descriptors as part of this to allow mac to send data * again, then once we have finished all processing and dropped the lock, we * will notify MAC. * * When we are processing descriptors here we try to avoid holding the itr_lock * except for the start and end of the process. This is an important way to * ensure that we don't block transmits. Because of this, there can only be one * thread performing a recycle at any given time between the interrupt path and * the transmit path trying to clean up. This is maintained using the * 'itr_recycle' boolean. If a recycle is already in progress then there's * generally not much reason to perform one simultaneously and so the caller * will just return. This is why the head (and thus returning descriptors) is * only used by the recycle path. * * ------- * Locking * ------- * * Mutexes exist on three different structures in the driver: * * 1) igc_t (igc_lock) * 2) igc_rx_ring_t (irr_lock, irr_free_lock) * 3) igc_tx_ring_t (itr_lock) * * The following rules hold for locking in the driver: * * 1) One should not hold locks for both the rx rings and tx rings at the same * time. If this is required, please determine if it is absolutely necessary. * 2) You should always take the controller's lock ahead of any ring's locks. * 3) The general rx ring lock (irr_lock) should be taken ahead of the free list * lock (irr_free_lock) if both are required. * * ------------------- * Future Improvements * ------------------- * * This driver was initially written with an eye towards getting something that * had broad use for folks with this hardware and not towards enabling every * feature immediately. Here are some areas that can be improved upon in the * driver. * * - Multiple ring, RSS support: As the OS changes towards offering more * interrupts or opting to participate in IRM, then you can more easily * offer RSS and related features. This should likely show up as a single * rx group with multiple rings and leverage the tx pseudo-group support. * * - TCP segmentation offload support: Right now the driver does not support * TSO. It'd potentially be a useful addition and help out folks. Fetching * information for TSO is in the tx data path right now. * * - FMA Support: Currently the driver does not rig up support for FMA. * Participating in that and more generally being able to reset the device * while it is operating in the face of fatal errors would be good. * * - TX stall detection: Related to the above, carefully designing a tx stall * detection and resetting the device when that happens would probably be * useful. * * - UFM support: Exposing the NVM and PBA (printed board assembly) through the * UFM subsystem would be a good thing to do. * * - Dynamic MTU changing: Right now the driver takes advantage of the * simplification of not allowing the MTU to change once the device has been * started. This isn't great, but it is far from the first (igb, e1000g, * ixgbe, etc.) to do this. It would be nice if this was lifted. */ #include #include #include #include #include #include #include #include #include #include #include "igc.h" /* * The core code expects the igc_mcast_raw to be a uint8_t packed array. We use * the ether_addr_t to make this a little more explicit and easy to reason * about, but that means we are relying on this size. 
*/ CTASSERT(sizeof (ether_addr_t) == 6); uint32_t igc_read32(igc_t *igc, uint32_t reg) { uint32_t *addr; ASSERT3U(reg, <, igc->igc_regs_size); addr = (uint32_t *)(igc->igc_regs_base + reg); return (ddi_get32(igc->igc_regs_hdl, addr)); } void igc_write32(igc_t *igc, uint32_t reg, uint32_t val) { uint32_t *addr; ASSERT3U(reg, <, igc->igc_regs_size); addr = (uint32_t *)(igc->igc_regs_base + reg); ddi_put32(igc->igc_regs_hdl, addr, val); } /* * Ask hardware if the link is up and ready. Note, this assumes that we're on a * copper phy and short circuits a few things. See igb_is_link_up() for what * this looks like for non-copper PHYs if that ever becomes relevant. */ static bool igc_link_up(igc_t *igc) { ASSERT(MUTEX_HELD(&igc->igc_lock)); /* * When the link is up, then the core code will clear the value below. * Otherwise we likely need to assume it's down. */ (void) igc_check_for_link(&igc->igc_hw); return (!igc->igc_hw.mac.get_link_status); } static void igc_intr_lsc(igc_t *igc) { link_state_t orig_state, new_state; uint32_t mmd_base; mutex_enter(&igc->igc_lock); orig_state = igc->igc_link_state; /* * Always force a check of the link. */ igc->igc_hw.mac.get_link_status = true; if (igc_link_up(igc)) { uint16_t duplex = 0; (void) igc_get_speed_and_duplex(&igc->igc_hw, &igc->igc_link_speed, &duplex); switch (duplex) { case HALF_DUPLEX: igc->igc_link_duplex = LINK_DUPLEX_HALF; break; case FULL_DUPLEX: igc->igc_link_duplex = LINK_DUPLEX_FULL; break; default: igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN; break; } igc->igc_link_state = LINK_STATE_UP; } else { igc->igc_link_state = LINK_STATE_DOWN; igc->igc_link_speed = 0; igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN; } new_state = igc->igc_link_state; /* * Next, grab a bunch of information from the PHY for future us. 
*/ (void) igc_read_phy_reg(&igc->igc_hw, PHY_CONTROL, &igc->igc_phy_ctrl); (void) igc_read_phy_reg(&igc->igc_hw, PHY_STATUS, &igc->igc_phy_status); (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_ADV, &igc->igc_phy_an_adv); (void) igc_read_phy_reg(&igc->igc_hw, PHY_LP_ABILITY, &igc->igc_phy_lp); (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_EXP, &igc->igc_phy_an_exp); (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_CTRL, &igc->igc_phy_1000t_ctrl); (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_STATUS, &igc->igc_phy_1000t_status); (void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS, &igc->igc_phy_ext_status); (void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS, &igc->igc_phy_ext_status); mmd_base = STANDARD_AN_REG_MASK << MMD_DEVADDR_SHIFT; (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_CTRL, &igc->igc_phy_mmd_ctrl); (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_STS1, &igc->igc_phy_mmd_sts); mutex_exit(&igc->igc_lock); if (orig_state != new_state) { mac_link_update(igc->igc_mac_hdl, new_state); } } static uint_t igc_intr_rx_queue(caddr_t arg1, caddr_t arg2) { igc_t *igc = (igc_t *)arg1; uintptr_t queue = (uintptr_t)arg2; igc_rx_ring_t *ring; mblk_t *mp = NULL; ASSERT3U(queue, <, igc->igc_nrx_rings); ring = &igc->igc_rx_rings[queue]; mutex_enter(&ring->irr_lock); if ((ring->irr_flags & IGC_RXR_F_POLL) == 0) { mp = igc_ring_rx(ring, IGC_RX_POLL_INTR); } mutex_exit(&ring->irr_lock); if (mp != NULL) { mac_rx_ring(igc->igc_mac_hdl, ring->irr_rh, mp, ring->irr_gen); } return (DDI_INTR_CLAIMED); } static uint_t igc_intr_tx_other(caddr_t arg1, caddr_t arg2) { igc_t *igc = (igc_t *)arg1; uint32_t icr = igc_read32(igc, IGC_ICR); igc_tx_recycle(igc, &igc->igc_tx_rings[0]); if ((icr & IGC_ICR_LSC) != 0) { igc_intr_lsc(igc); } return (DDI_INTR_CLAIMED); } static bool igc_setup_regs(igc_t *igc) { int ret; ddi_device_acc_attr_t da; if (pci_config_setup(igc->igc_dip, &igc->igc_cfgspace) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to map config space"); return (false); } if (ddi_dev_regsize(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_size) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get BAR %u size", IGC_PCI_BAR - 1); return (false); } bzero(&da, sizeof (ddi_device_acc_attr_t)); da.devacc_attr_version = DDI_DEVICE_ATTR_V1; da.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC; da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; da.devacc_attr_access = DDI_DEFAULT_ACC; if ((ret = ddi_regs_map_setup(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_base, 0, igc->igc_regs_size, &da, &igc->igc_regs_hdl)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to map registers: %d", ret); return (false); } return (true); } /* * Go through the process of initializing the igc core code. First we have to * fill in the information that the common code requires to identify the * hardware and set the mac type. After that we can go through and set up all of * the function initialization. 
*/ static bool igc_core_code_init(igc_t *igc) { int ret; int *regs; uint_t nprop; igc->igc_hw.back = igc; igc->igc_hw.vendor_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_VENID); igc->igc_hw.device_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_DEVID); igc->igc_hw.revision_id = pci_config_get8(igc->igc_cfgspace, PCI_CONF_REVID); igc->igc_hw.subsystem_vendor_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_SUBVENID); igc->igc_hw.subsystem_device_id = pci_config_get16(igc->igc_cfgspace, PCI_CONF_SUBSYSID); if ((ret = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, igc->igc_dip, DDI_PROP_DONTPASS, "reg", ®s, &nprop)) != DDI_PROP_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to look up 'reg' " "property: %d", ret); return (false); } /* * We fill out the function and command word. We currently don't fill * out the bus type, speed, and width as it's not used by the common * code, leaving it all at unknown. We can grab that information when it * needs it. We do fill out the function and command word as the former * is important and the latter is easy to grab. */ igc->igc_hw.bus.func = PCI_REG_FUNC_G(regs[0]); igc->igc_hw.bus.pci_cmd_word = pci_config_get16(igc->igc_cfgspace, PCI_CONF_COMM); ddi_prop_free(regs); /* * The common code asks for the memory mapped address to be set in its * structure. Though in theory it promises not to use it. */ igc->igc_hw.hw_addr = (uint8_t *)igc->igc_regs_base; if ((ret = igc_set_mac_type(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to set mac type: %d", ret); return (false); } if ((ret = igc_setup_init_funcs(&igc->igc_hw, true)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to setup core code " "function pointers: %d", ret); return (false); } /* * Go ahead and attempt to get the bus information even though this * doesn't actually do anything right now. */ if ((ret = igc_get_bus_info(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "core code failed to get bus " "info: %d", ret); return (false); } return (true); } static bool igc_limits_init(igc_t *igc) { switch (igc->igc_hw.mac.type) { case igc_i225: igc->igc_limits.il_max_rx_rings = IGC_MAX_RX_RINGS_I225; igc->igc_limits.il_max_tx_rings = IGC_MAX_RX_RINGS_I225; igc->igc_limits.il_max_mtu = IGC_MAX_MTU_I225; break; default: dev_err(igc->igc_dip, CE_WARN, "unknown MAC type: %u", igc->igc_hw.mac.type); return (false); } return (true); } /* * Determine the hardware buffer sizes that are required for the given MTU. * There are a few different constraints that we try to enforce here that come * from the hardware and others that come from us: * * 1) The hardware requires that the rx and tx sizes all be 1 KiB (0x400) byte * aligned. * 2) Our tx engine can handle copying across multiple descriptors, so we cap * the maximum tx buffer size at one page. * 3) Right now our rx engine does not handle scanning multiple buffers for rx * (see the theory statement), so we end up making the rx buffer have to fix the * maximum frame size. * 4) rx buffers need to also account for IP alignment, so we make sure to * allocate extra bytes for that. 
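 *
 * As a worked example, with the default 1500 byte MTU, a 4 KiB page size, and
 * assuming the usual 2 byte IGC_RX_BUF_IP_ALIGN, the calculation below works
 * out to:
 *
 *      igc_max_frame   = 1500 + 18 (VLAN header) + 4 (FCS) = 1522
 *      igc_rx_buf_size = P2ROUNDUP(1522 + 2, 1024)         = 2048
 *      igc_tx_buf_size = MIN(P2ROUNDUP(1522, 1024), 4096)  = 2048
 *
 * which is where the 2 KiB buffer for a 1500 byte MTU mentioned in the theory
 * statement comes from.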
*/ void igc_hw_buf_update(igc_t *igc) { unsigned long pagesize = ddi_ptob(igc->igc_dip, 1); uint32_t tx_mtu; igc->igc_max_frame = igc->igc_mtu + sizeof (struct ether_vlan_header) + ETHERFCSL; igc->igc_rx_buf_size = P2ROUNDUP_TYPED(igc->igc_max_frame + IGC_RX_BUF_IP_ALIGN, IGC_BUF_ALIGN, uint32_t); tx_mtu = P2ROUNDUP_TYPED(igc->igc_max_frame, IGC_BUF_ALIGN, uint32_t); igc->igc_tx_buf_size = MIN(tx_mtu, pagesize); } static bool igc_intr_init(igc_t *igc) { int ret, types, nintrs, navail, req; const int min_nintrs = 2; if ((ret = ddi_intr_get_supported_types(igc->igc_dip, &types)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get supported " "interrupts: %d", ret); return (false); } /* * For now, we simplify our lives and device support by only supporting * MSI-X interrupts. When we find versions of this without MSI-X * support, we can go and add what we need. */ if ((types & DDI_INTR_TYPE_MSIX) == 0) { dev_err(igc->igc_dip, CE_WARN, "device does not support MSI-X, " "found %d", types); return (false); } if ((ret = ddi_intr_get_nintrs(igc->igc_dip, DDI_INTR_TYPE_MSIX, &nintrs)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get number of " "supported MSI-X interrupts: %d", ret); return (false); } if (nintrs < min_nintrs) { dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires " "%d MSI-X interrupts be supported, found %d", min_nintrs, nintrs); return (false); } if ((ret = ddi_intr_get_navail(igc->igc_dip, DDI_INTR_TYPE_MSIX, &navail)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get number of " "available MSI-X interrupts: %d", ret); return (false); } if (navail < min_nintrs) { dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires " "%d MSI-X interrupts be available, found %d", min_nintrs, navail); return (false); } /* * In the future this could be based upon the multiple queues that the * device supports, but for now it's limited to two. See 'Rings and * Interrupts' in the theory statement for more background. */ req = min_nintrs; req = MIN(req, navail); igc->igc_intr_size = req * sizeof (ddi_intr_handle_t); igc->igc_intr_handles = kmem_alloc(igc->igc_intr_size, KM_SLEEP); if ((ret = ddi_intr_alloc(igc->igc_dip, igc->igc_intr_handles, DDI_INTR_TYPE_MSIX, 0, req, &igc->igc_nintrs, DDI_INTR_ALLOC_NORMAL)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to allocate interrupts: " "%d", ret); return (false); } igc->igc_intr_type = DDI_INTR_TYPE_MSIX; igc->igc_attach |= IGC_ATTACH_INTR_ALLOC; if (igc->igc_nintrs < min_nintrs) { dev_err(igc->igc_dip, CE_WARN, "received %d interrupts, but " "needed at least %d", igc->igc_nintrs, min_nintrs); return (false); } if ((ret = ddi_intr_get_pri(igc->igc_intr_handles[0], &igc->igc_intr_pri)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt " "priority: %d", ret); return (false); } if ((ret = ddi_intr_get_cap(igc->igc_intr_handles[0], &igc->igc_intr_cap)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt " "capabilities: %d", ret); return (false); } return (true); } /* * As part of allocating our rings we make the following assumptions about * interrupt assignments. All tx rings share interrupt 0. All rx rings have * separate interrupts starting from interrupt 1. 
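 * With the single tx ring and single rx ring that we currently enable, that
 * works out to:
 *
 *      vector 0        tx ring 0 recycling and link state/other causes
 *      vector 1        rx ring 0 receive processing
 *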
This design may likely change * in the face of actual multi-ring support */ static bool igc_rings_alloc(igc_t *igc) { uint32_t intr = 0; igc->igc_tx_rings = kmem_zalloc(sizeof (igc_tx_ring_t) * igc->igc_ntx_rings, KM_SLEEP); for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { igc->igc_tx_rings[i].itr_igc = igc; igc->igc_tx_rings[i].itr_idx = i; igc->igc_tx_rings[i].itr_intr_idx = intr; mutex_init(&igc->igc_tx_rings[i].itr_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri)); if (!igc_tx_ring_stats_init(igc, &igc->igc_tx_rings[i])) { return (false); } } igc->igc_rx_rings = kmem_zalloc(sizeof (igc_rx_ring_t) * igc->igc_nrx_rings, KM_SLEEP); intr = 1; for (uint32_t i = 0; i < igc->igc_nrx_rings; i++, intr++) { igc->igc_rx_rings[i].irr_igc = igc; igc->igc_rx_rings[i].irr_idx = i; igc->igc_rx_rings[i].irr_intr_idx = intr; mutex_init(&igc->igc_rx_rings[i].irr_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri)); mutex_init(&igc->igc_rx_rings[i].irr_free_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri)); cv_init(&igc->igc_rx_rings[i].irr_free_cv, NULL, CV_DRIVER, NULL); if (!igc_rx_ring_stats_init(igc, &igc->igc_rx_rings[i])) { return (false); } } ASSERT3U(intr, ==, igc->igc_nintrs); return (true); } /* * Allocate our interrupts. Note, we have more or less constrained the device * right now to only request two interrupts which we use in a fixed way. If we * end up with more varied queue support then this should be changed around. */ static bool igc_intr_hdlr_init(igc_t *igc) { int ret; if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[0], igc_intr_tx_other, igc, NULL)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to add tx/other " "interrupt handler: %d", ret); return (false); } if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[1], igc_intr_rx_queue, igc, (uintptr_t)0)) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to add rx interrupt " "handler: %d", ret); if ((ret = ddi_intr_remove_handler(igc->igc_intr_handles[0])) != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to remove " "tx/other interrupt handler"); } return (false); } return (true); } static void igc_hw_control(igc_t *igc, bool take) { uint32_t ctrl = igc_read32(igc, IGC_CTRL_EXT); if (take) { ctrl |= IGC_CTRL_EXT_DRV_LOAD; } else { ctrl &= ~IGC_CTRL_EXT_DRV_LOAD; } igc_write32(igc, IGC_CTRL_EXT, ctrl); } /* * Basic device initialization and sanity check. This covers that we can * properly reset the device, validate its checksum, and get a valid MAC * address. */ static bool igc_hw_init(igc_t *igc) { int ret; uint32_t eecd; if ((ret = igc_reset_hw(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d", ret); return (false); } /* * Goodbye firmware. */ igc_hw_control(igc, true); /* * Check the NVM validiity if a device is present. */ eecd = igc_read32(igc, IGC_EECD); if ((eecd & IGC_EECD_EE_DET) != 0) { if ((ret = igc_validate_nvm_checksum(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to validate " "igc NVM checksum: %d", ret); return (false); } } if ((ret = igc_read_mac_addr(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to read MAC address: %d", ret); return (false); } if ((ret = igc_get_phy_id(&igc->igc_hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to get PHY id: %d", ret); return (false); } return (true); } /* * In case the user has modified the LED state through MAC_CAPAB_LED, restore * that back to the defaults we got when we started up the device. 
*/ static void igc_led_fini(igc_t *igc) { igc_write32(igc, IGC_LEDCTL, igc->igc_ledctl); } /* * Traditionally the Intel NIC drivers avoid touching activity pins as part of * their behavior for what we use. We also don't touch a pin if it's in SDP mode * and not being used to drive an LED as it means it's likely not for us. */ static bool igc_led_ignore(i225_led_mode_t mode) { switch (mode) { case I225_LED_M_FILTER_ACT: case I225_LED_M_LINK_ACT: case I225_LED_M_SDP: case I225_LED_M_PAUSE: case I225_LED_M_ACT: return (true); default: return (false); } } static inline uint32_t igc_led_bitoff(uint32_t led) { VERIFY3U(led, <, 3); return (led * 8); } static inline uint32_t igc_led_get_mode(uint32_t led, uint32_t reg) { uint32_t off = igc_led_bitoff(led); return (bitx32(reg, 3 + off, off)); } static inline uint32_t igc_led_set_mode(uint32_t led, uint32_t reg, i225_led_mode_t mode) { uint32_t off = igc_led_bitoff(led); return (bitset32(reg, 3 + off, off, mode)); } static inline uint32_t igc_led_get_ivrt(uint32_t led, uint32_t reg) { uint32_t off = igc_led_bitoff(led) + 6; return (bitx32(reg, off, off)); } static inline uint32_t igc_led_set_blink(uint32_t led, uint32_t reg, bool en) { uint32_t off = igc_led_bitoff(led) + 7; return (bitset32(reg, off, off, en)); } /* * There are three LEDs on the chip. The reference defines LED0 for 1 GbE link * up, LED1 for a 2.5GbE link up, and LED 2 for activity. However, this is all * controllable in the NVM so we shouldn't assume that these have any of their * default values. We instead read the LEDCTL register to see how it was set up * by default (though the NVM would likely be better). We then create pre-canned * LEDCTL register values for on, off, and default. See igc_osdep.h for some of * the caveats in definitions here. Note, we only tweak the non-activity LEDs * and if an LED has been indicated that it's being used for SDP, we don't touch * it. */ static void igc_led_init(igc_t *igc) { uint32_t led = igc_read32(igc, IGC_LEDCTL); igc->igc_ledctl = led; igc->igc_ledctl_on = led; igc->igc_ledctl_off = led; igc->igc_ledctl_blink = led; for (uint32_t i = 0; i < IGC_I225_NLEDS; i++) { i225_led_mode_t mode = igc_led_get_mode(i, led); if (!igc_led_ignore(mode)) { /* * If the inversion logic is on, that changes what the * on and off modes mean, so we need to change how we * set that appropriately. 
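 * For example, if the NVM configured an LED with its invert bit set, then
 * making it visibly 'on' requires programming I225_LED_M_OFF, which is why the
 * two branches below swap the modes that they use.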
*/ if (igc_led_get_ivrt(i, led) != 0) { igc->igc_ledctl_on = igc_led_set_mode(i, igc->igc_ledctl_on, I225_LED_M_OFF); igc->igc_ledctl_off = igc_led_set_mode(i, igc->igc_ledctl_off, I225_LED_M_ON); igc->igc_ledctl_blink = igc_led_set_mode(i, igc->igc_ledctl_blink, I225_LED_M_OFF); } else { igc->igc_ledctl_on = igc_led_set_mode(i, igc->igc_ledctl_on, I225_LED_M_ON); igc->igc_ledctl_off = igc_led_set_mode(i, igc->igc_ledctl_off, I225_LED_M_OFF); igc->igc_ledctl_blink = igc_led_set_mode(i, igc->igc_ledctl_blink, I225_LED_M_ON); } } igc->igc_ledctl_blink = igc_led_set_blink(i, igc->igc_ledctl_blink, true); } igc->igc_led_mode = MAC_LED_DEFAULT; } static void igc_write_ivar(igc_t *igc, uint32_t queue, bool rx, uint32_t msix) { const uint32_t ivarno = queue >> 1; const uint32_t reg = IGC_IVAR0 + ivarno * 4; const uint32_t val = msix | IGC_IVAR_VALID; uint32_t bitoff, bitend, ivar; if (rx) { if ((queue % 2) == 0) { bitoff = IGC_IVAR_RX0_START; } else { bitoff = IGC_IVAR_RX1_START; } } else { if ((queue % 2) == 0) { bitoff = IGC_IVAR_TX0_START; } else { bitoff = IGC_IVAR_TX1_START; } } bitend = bitoff + IGC_IVAR_ENT_LEN - 1; ivar = igc_read32(igc, reg); ivar = bitset32(ivar, bitend, bitoff, val); igc_write32(igc, reg, ivar); igc->igc_eims |= 1 << msix; } /* * Here we need to go through and initialize the hardware's notion of how * interrupts are mapped to causes. The device must be specifically enabled for * MSI-X and then this is also where we go ensure that all of our interrupt * coalescing is properly enabled. Note, we must first touch the GPIE register * to enable MSI-X settings otherwise later settings won't do anything. */ static void igc_hw_intr_init(igc_t *igc) { uint32_t gpie, ivar; gpie = IGC_GPIE_NSICR | IGC_GPIE_MSIX_MODE | IGC_GPIE_EIAME | IGC_GPIE_PBA; igc_write32(igc, IGC_GPIE, gpie); /* * Other causes are always explicitly mapped to cause 0. Each ring then * has its own mapping. In the MISC IVAR, these start at bit 8. We leave * the '0 |' out below just to avoid a compiler complaining. We also * must unamsk this interrupt cause, which is in bit 0. */ ivar = IGC_IVAR_VALID << 8; igc_write32(igc, IGC_IVAR_MISC, ivar); igc->igc_eims = 1; /* * There are a few IVAR registers available in hardware. Each IVAR * register handles mapping a given queue to an MSI-X. Each IVAR handles * two queues. */ for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { igc_write_ivar(igc, i, false, igc->igc_tx_rings[i].itr_intr_idx); } for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) { igc_write_ivar(igc, i, true, igc->igc_rx_rings[i].irr_intr_idx); } for (uint32_t i = 0; i < igc->igc_nintrs; i++) { igc_write32(igc, IGC_EITR(i), igc->igc_eitr); } } /* * Synchronize our sense of the unicast table over to the device. If this is the * first time that we're here due to attach, we need to go through and allocate * the tracking table. */ static void igc_unicast_sync(igc_t *igc) { ASSERT(MUTEX_HELD(&igc->igc_lock)); if (igc->igc_ucast == NULL) { igc->igc_nucast = igc->igc_hw.mac.rar_entry_count; igc->igc_ucast = kmem_zalloc(sizeof (igc_addr_t) * igc->igc_nucast, KM_SLEEP); } for (uint16_t i = 0; i < igc->igc_nucast; i++) { int ret = igc_rar_set(&igc->igc_hw, igc->igc_ucast[i].ia_mac, i); /* * Common code today guarantees this can't fail. Put this here * to ensure to guard against future updates. */ VERIFY3S(ret, ==, IGC_SUCCESS); } } /* * The core code interface to the multicast table requires us to give them a * packed uint8_t array that they manually walk through in ETHERADDRL (6 byte) * chunks. 
This must be packed. To deal with this we opt to preserve a normal * list of multicast addresses and then a secondary version that's serialized as * the core code wants it. We allocate the memory for this secondary version at * the start. */ void igc_multicast_sync(igc_t *igc) { uint16_t nvalid; ASSERT(MUTEX_HELD(&igc->igc_lock)); if (igc->igc_mcast == NULL) { igc->igc_nmcast = igc->igc_hw.mac.mta_reg_count; igc->igc_mcast = kmem_zalloc(sizeof (igc_addr_t) * igc->igc_nmcast, KM_SLEEP); igc->igc_mcast_raw = kmem_alloc(sizeof (ether_addr_t) * igc->igc_nmcast, KM_SLEEP); } bzero(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast); nvalid = 0; for (uint16_t i = 0; i < igc->igc_nmcast; i++) { ether_addr_t *targ = &igc->igc_mcast_raw[nvalid]; if (!igc->igc_mcast[i].ia_valid) continue; bcopy(igc->igc_mcast[i].ia_mac, targ, sizeof (ether_addr_t)); nvalid++; } igc_update_mc_addr_list(&igc->igc_hw, (uint8_t *)igc->igc_mcast_raw, nvalid); } /* * This function is used to reinitialize the PBA, our various flow control * settings, reset hardware, ensure that the EEE, DPLU, and related power modes * are in the correct state. */ bool igc_hw_common_init(igc_t *igc) { int ret; uint32_t pba, hwm, hwmp, hwm2x; struct igc_hw *hw = &igc->igc_hw; /* * The PBA register determines which portion is used for the receive * buffers and which is used for the transmit buffers. This follows from * the I210 and reference drivers which use 34K as the default. We * currently leave the RXPBS and TXPBS at their power-on-reset defaults. * * We set the watermark based settings similar to igb, ensuring that we * have 16-byte granularity. The general guidelines from there was that * when it comes to automatic Ethernet PAUSE frame generation we should: * * - After an XOFF, you want to receive at least two frames. We use * whichever is smaller of 9/10ths and two frames. * - The low water mark apparently wants to be closer to the high water * mark. * * See igb_init_adapter() for more information. We basically use the * same calculation it did, given that the MAC is basically the same. */ pba = IGC_PBA_34K; hwmp = (pba << 10) * 9 / 10; hwm2x = (pba << 10) - 2 * igc->igc_max_frame; hwm = MIN(hwmp, hwm2x); hw->fc.high_water = hwm & 0xfffffff0; hw->fc.low_water = igc->igc_hw.fc.high_water - 16; /* * Use the suggested default pause time. */ hw->fc.pause_time = IGC_FC_PAUSE_TIME; hw->fc.send_xon = true; if ((ret = igc_reset_hw(hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d", ret); return (false); } if ((ret = igc_init_hw(hw)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to init hardware: %d", ret); return (false); } /* * Clear wake on LAN and set other power states. In addition, disable * EEE for now. */ igc_write32(igc, IGC_WUC, 0); if ((ret = igc_set_d0_lplu_state(hw, false)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to set D0 LPLU mode: %d", ret); return (false); } /* * There have been reports that enabling EEE for some 2.5G devices has * led to issues with the I225/226. It's not entirely clear, but we * default to disabling this like in igb/e1000g for now. 
*/ if ((ret = igc_set_eee_i225(hw, false, false, false)) != IGC_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to set EEE mode: %d", ret); return (false); } igc_hw_intr_init(igc); mutex_enter(&igc->igc_lock); igc_unicast_sync(igc); igc_multicast_sync(igc); igc->igc_hw.mac.get_link_status = true; (void) igc_get_phy_info(hw); (void) igc_check_for_link(hw); mutex_exit(&igc->igc_lock); return (true); } static bool igc_intr_en(igc_t *igc) { int ret; if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) { ret = ddi_intr_block_enable(igc->igc_intr_handles, igc->igc_nintrs); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to block " "enable interrupts: %d", ret); return (false); } } else { for (int i = 0; i < igc->igc_nintrs; i++) { ret = ddi_intr_enable(igc->igc_intr_handles[i]); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to " "enable interrupt %d: %d", i, ret); for (int clean = 0; clean < i; clean++) { ret = ddi_intr_disable( igc->igc_intr_handles[clean]); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to disable " "interrupt %d while " "unwinding: %d", i, ret); } } return (false); } } } /* * Now that we've enabled interrupts here, clear any pending interrupts * and make sure hardware interrupts are enabled. */ (void) igc_read32(igc, IGC_ICR); return (true); } /* * Undo interrupt enablement. */ void igc_hw_intr_disable(igc_t *igc) { igc_write32(igc, IGC_EIMC, UINT32_MAX); igc_write32(igc, IGC_EIAC, 0); igc_write32(igc, IGC_IMC, UINT32_MAX); } /* * This is used during the GLDv3 mc_start(9E) entry point to enable interrupts * on the device itself. */ void igc_hw_intr_enable(igc_t *igc) { uint32_t ims; /* * First we clear pending interrupts. */ (void) igc_read32(igc, IGC_ICR); /* * The hardware has extended and non-extended interrupt masks and * auto-clear registers. We always disable auto-clear for the * non-extended portions. See the I210 datasheet 'Setting Interrupt * Registers' for a better sense of what's going on here. * * In the IMS register we always register link status change events and * device reset assertions. 
*/ ims = IGC_IMS_LSC | IGC_IMS_DRSTA; igc_write32(igc, IGC_EIAC, igc->igc_eims); igc_write32(igc, IGC_EIMS, igc->igc_eims); igc_write32(igc, IGC_IMS, ims); igc_write32(igc, IGC_IAM, 0); } static void igc_cleanup(igc_t *igc) { if (igc->igc_mcast != NULL) { ASSERT3U(igc->igc_nmcast, !=, 0); kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast); kmem_free(igc->igc_mcast, sizeof (igc_addr_t) * igc->igc_nmcast); igc->igc_nmcast = 0; igc->igc_mcast = NULL; } if (igc->igc_ucast != NULL) { ASSERT3U(igc->igc_nucast, !=, 0); kmem_free(igc->igc_ucast, sizeof (igc_addr_t) * igc->igc_nucast); igc->igc_nucast = 0; igc->igc_ucast = NULL; } if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) { int ret; if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) { ret = ddi_intr_block_disable(igc->igc_intr_handles, igc->igc_nintrs); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed to " "block disable interrupts: %d", ret); } } else { for (int i = 0; i < igc->igc_nintrs; i++) { ret = ddi_intr_disable( igc->igc_intr_handles[i]); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "failed " "to disable interrupt %d: %d", i, ret); } } } igc->igc_attach &= ~IGC_ATTACH_INTR_EN; } if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) { int ret = mac_unregister(igc->igc_mac_hdl); if (ret != 0) { dev_err(igc->igc_dip, CE_WARN, "failed to unregister " "MAC handle: %d", ret); } igc->igc_attach &= ~IGC_ATTACH_MAC; } if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) { igc_stats_fini(igc); igc->igc_attach &= ~IGC_ATTACH_STATS; } if ((igc->igc_attach & IGC_ATTACH_LED) != 0) { igc_led_fini(igc); igc->igc_attach &= ~IGC_ATTACH_LED; } if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) { for (int i = 0; i < igc->igc_nintrs; i++) { int ret = ddi_intr_remove_handler(igc->igc_intr_handles[i]); if (ret != 0) { dev_err(igc->igc_dip, CE_WARN, "failed to " "remove interrupt %d handler: %d", i, ret); } } igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER; } if (igc->igc_tx_rings != NULL) { for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]); mutex_destroy(&igc->igc_tx_rings[i].itr_lock); } kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) * igc->igc_ntx_rings); igc->igc_tx_rings = NULL; } if (igc->igc_rx_rings != NULL) { for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) { igc_rx_ring_stats_fini(&igc->igc_rx_rings[i]); cv_destroy(&igc->igc_rx_rings[i].irr_free_cv); mutex_destroy(&igc->igc_rx_rings[i].irr_free_lock); mutex_destroy(&igc->igc_rx_rings[i].irr_lock); } kmem_free(igc->igc_rx_rings, sizeof (igc_rx_ring_t) * igc->igc_nrx_rings); igc->igc_rx_rings = NULL; } if ((igc->igc_attach & IGC_ATTACH_MUTEX) != 0) { mutex_destroy(&igc->igc_lock); igc->igc_attach &= ~IGC_ATTACH_MUTEX; } if ((igc->igc_attach & IGC_ATTACH_INTR_ALLOC) != 0) { for (int i = 0; i < igc->igc_nintrs; i++) { int ret = ddi_intr_free(igc->igc_intr_handles[i]); if (ret != DDI_SUCCESS) { dev_err(igc->igc_dip, CE_WARN, "unexpected " "failure freeing interrupt %d: %d", i, ret); } } igc->igc_attach &= ~IGC_ATTACH_INTR_ALLOC; } if (igc->igc_intr_handles != NULL) { ASSERT3U(igc->igc_intr_size, !=, 0); kmem_free(igc->igc_intr_handles, igc->igc_intr_size); } /* * Now that we're almost done, begrudgingly let firmware know we're * done. 
*/ igc_hw_control(igc, false); if (igc->igc_regs_hdl != NULL) { ddi_regs_map_free(&igc->igc_regs_hdl); igc->igc_regs_base = NULL; } if (igc->igc_cfgspace != NULL) { pci_config_teardown(&igc->igc_cfgspace); } igc->igc_attach &= ~IGC_ATTACH_REGS; ddi_set_driver_private(igc->igc_dip, NULL); igc->igc_dip = NULL; VERIFY0(igc->igc_attach); kmem_free(igc, sizeof (igc_t)); } static int igc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { igc_t *igc; if (cmd != DDI_ATTACH) { return (DDI_FAILURE); } igc = kmem_zalloc(sizeof (igc_t), KM_SLEEP); ddi_set_driver_private(dip, igc); igc->igc_dip = dip; /* * Initialize a few members that are not zero-based. */ igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN; igc->igc_link_state = LINK_STATE_UNKNOWN; /* * Set up all the register spaces that hardware requires. */ if (!igc_setup_regs(igc)) { goto err; } igc->igc_attach |= IGC_ATTACH_REGS; /* * Setup the common code. */ if (!igc_core_code_init(igc)) { goto err; } if (!igc_limits_init(igc)) { goto err; } /* * Go allocate and set up all of our interrupts. */ if (!igc_intr_init(igc)) { goto err; } /* * Initialize our main mutex for the device now that we have an * interrupt priority. */ mutex_init(&igc->igc_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri)); igc->igc_attach |= IGC_ATTACH_MUTEX; /* * We now want to determine the total number of rx and tx rings that we * have based on our interrupt allocation so we can go through and * perform the rest of the device setup that is required. The various * queues that we have are mapped to a given MSI-X through the IVAR * registers in the device. There is also an IVAR_MISC register that * maps link state change events and other issues up to two vectors. * * There isn't strictly per-queue interrupt generation control. Instead, * when in MSI-X mode, the device has an extended interrupt cause and * mask register. The mask register allows us to mask the five bits * described above. * * Because of all this we end up limiting the number of queues that we * use to 2 for now: 1 for tx and 1 for rx. Interrupt 0 is for tx/other * and 1 for rx. */ igc->igc_nrx_rings = 1; igc->igc_ntx_rings = 1; /* * Default to a 1500 byte MTU. */ igc->igc_mtu = ETHERMTU; igc_hw_buf_update(igc); /* * Initialize default descriptor limits and thresholds. We allocate 1.5 * times the number of rx descriptors so that way we can loan up to * 1/3rd of them. We allocate an even number of tx descriptors. */ igc->igc_rx_ndesc = IGC_DEF_RX_RING_SIZE; igc->igc_tx_ndesc = IGC_DEF_TX_RING_SIZE; igc->igc_rx_nbuf = igc->igc_rx_ndesc + (igc->igc_rx_ndesc >> 1); igc->igc_tx_nbuf = igc->igc_tx_ndesc; igc->igc_rx_nfree = igc->igc_rx_nbuf - igc->igc_rx_ndesc; igc->igc_rx_intr_nframes = IGC_DEF_RX_RING_INTR_LIMIT; igc->igc_rx_bind_thresh = IGC_DEF_RX_BIND; igc->igc_tx_bind_thresh = IGC_DEF_TX_BIND; igc->igc_tx_notify_thresh = IGC_DEF_TX_NOTIFY_MIN; igc->igc_tx_recycle_thresh = IGC_DEF_TX_RECYCLE_MIN; igc->igc_tx_gap = IGC_DEF_TX_GAP; igc->igc_eitr = IGC_DEF_EITR; if (!igc_rings_alloc(igc)) { goto err; } if (!igc_intr_hdlr_init(igc)) { goto err; } igc->igc_attach |= IGC_ATTACH_INTR_HANDLER; /* * Next reset the device before we begin initializing anything else. As * part of this, validate the flash checksum if present. This is all * initialization that we would only do once per device. Other * initialization that we want to do after any reset is done is * igc_hw_common_init(). 
*/ if (!igc_hw_init(igc)) { goto err; } igc_led_init(igc); igc->igc_attach |= IGC_ATTACH_LED; /* * Snapshot our basic settings that users can eventually control in the * device. We start with always enabling auto-negotiation and * advertising the basic supported speeds. The I225v1 does have * substantial problems with enabling 2.5G due to the fact that it * doesn't maintain a proper inter-packet gap. Despite that, we default * to enabling 2.5G for now as its supposedly not broken with all link * partners and the NVM. We also don't have a way of actually * identifying and mapping that to something in the driver today, * unfortunately. */ igc->igc_hw.mac.autoneg = true; igc->igc_hw.phy.autoneg_wait_to_complete = false; igc->igc_hw.phy.autoneg_advertised = IGC_DEFAULT_ADV; igc->igc_hw.fc.requested_mode = igc_fc_default; igc->igc_hw.fc.current_mode = igc_fc_default; if (!igc_hw_common_init(igc)) { goto err; } if (!igc_stats_init(igc)) { goto err; } igc->igc_attach |= IGC_ATTACH_STATS; /* * Register with MAC */ if (!igc_mac_register(igc)) { goto err; } igc->igc_attach |= IGC_ATTACH_MAC; /* * Enable interrupts and get going. */ if (!igc_intr_en(igc)) { goto err; } igc->igc_attach |= IGC_ATTACH_INTR_EN; return (DDI_SUCCESS); err: igc_cleanup(igc); return (DDI_FAILURE); } static int igc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { igc_t *igc; if (cmd != DDI_DETACH) { return (DDI_FAILURE); } igc = ddi_get_driver_private(dip); if (igc == NULL) { dev_err(dip, CE_WARN, "asked to detach, but missing igc_t"); return (DDI_FAILURE); } igc_cleanup(igc); return (DDI_SUCCESS); } static struct cb_ops igc_cb_ops = { .cb_open = nulldev, .cb_close = nulldev, .cb_strategy = nodev, .cb_print = nodev, .cb_dump = nodev, .cb_read = nodev, .cb_write = nodev, .cb_ioctl = nodev, .cb_devmap = nodev, .cb_mmap = nodev, .cb_segmap = nodev, .cb_chpoll = nochpoll, .cb_prop_op = ddi_prop_op, .cb_flag = D_MP, .cb_rev = CB_REV, .cb_aread = nodev, .cb_awrite = nodev }; static struct dev_ops igc_dev_ops = { .devo_rev = DEVO_REV, .devo_refcnt = 0, .devo_getinfo = NULL, .devo_identify = nulldev, .devo_probe = nulldev, .devo_attach = igc_attach, .devo_detach = igc_detach, .devo_reset = nodev, .devo_quiesce = ddi_quiesce_not_supported, .devo_cb_ops = &igc_cb_ops }; static struct modldrv igc_modldrv = { .drv_modops = &mod_driverops, .drv_linkinfo = "Intel I226/226 Ethernet Controller", .drv_dev_ops = &igc_dev_ops }; static struct modlinkage igc_modlinkage = { .ml_rev = MODREV_1, .ml_linkage = { &igc_modldrv, NULL } }; int _init(void) { int ret; mac_init_ops(&igc_dev_ops, IGC_MOD_NAME); if ((ret = mod_install(&igc_modlinkage)) != 0) { mac_fini_ops(&igc_dev_ops); } return (ret); } int _info(struct modinfo *modinfop) { return (mod_info(&igc_modlinkage, modinfop)); } int _fini(void) { int ret; if ((ret = mod_remove(&igc_modlinkage)) == 0) { mac_fini_ops(&igc_dev_ops); } return (ret); }