1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2024 Oxide Computer Company 14 */ 15 16 /* 17 * Intel I225/226 Ethernet Driver 18 * ------------------------------ 19 * 20 * This driver implements support for the Intel I225 and I226 Ethernet 21 * controllers, which support up to 2.5 GbE and generally only support BASE-T 22 * copper phys. This device is yet another variant on the venerable Intel 1 GbE 23 * devices that are found in e1000g(4D) and igb(4D). This is its own driver in 24 * part because that's how Intel did things and refactored their common code, 25 * which we import and which is found in the 'core' directory. 26 * 27 * There is not a good datasheet for the MAC that we've been able to find for 28 * this part. It's not clear that Intel even has a doc for this in their 29 * Resource and Design Center. The primary datasheet documents the NVM and other 30 * parts of it, but not the software interface. Based on observations from the 31 * common code we describe this as somewhat of an evolution of the I217 and 32 * I210, with fewer features than the I210, which comes from the server world 33 * (which was itself a more stripped down I350). 34 * 35 * The result of all this is us trying to focus on what we know about this part 36 * and making some assumptions along the way. This includes things like: 37 * 38 * 1) We believe that the device only supports up to 4 RX and TX queues. 39 * 2) There is only one TX context for each TX queue and it is mapped to the 40 * queue. 41 * 3) There is no support for the head writeback modes that we've found. 42 * 4) This does otherwise support both the MSI-X and MSI/INTx interrupt 43 * management which are shaped very differently in the device. 44 * 5) The 2500BASE-T PHY support is unique, but the other PHY settings are 45 * roughly the same as far as we can tell. 46 * 47 * There are certainly more differences than the points up above, but the above 48 * are ones that generally influence our design. 49 * 50 * ------------ 51 * Organization 52 * ------------ 53 * 54 * This driver is first broken into two different pieces. There is the 'core' 55 * code which we import from Intel via FreeBSD. All of these sources are in the 56 * 'uts/common/io/igc/core' directory and we try our hardest to avoid modifying 57 * them (hence the smatch gags). The core code can be thought of as abstracting 58 * the MAC, NVM, and PHY across different chipsets (right now it's all the I225) 59 * and providing us with a series of library calls that we can use to manage the 60 * chip. 61 * 62 * The remaining files that sit alongside this one implement different portions 63 * of functionality related to the device. In particular: 64 * 65 * igc.[ch]: This is the main entry point for the driver and the 66 * source of this block comment. It implements all of the 67 * basic DDI entry points: attach and detach, interrupts, 68 * PCI config space and register set up and tear down. 69 * 70 * The header file contains all of the structure 71 * definitions that we use throughout this and the basic 72 * constants we use for sizing.
73 * 74 * igc_gld.c This file implements all of the major GLDv3 required 75 * entry points that are found in mac(9E). The guts of the 76 * actual I/O are in igc_ring.c, but getting and setting 77 * all of the various MAC properties and other bits is 78 * here. 79 * 80 * igc_osdep.[ch] The osdep (OS dependent) files are used to define and 81 * implement the functionality required by the common code. 82 * igc_osdep.h is included in the build of each file. 83 * 84 * We have a second use for igc_osdep.h which is where we 85 * put missing hardware definitions that apply. This is for 86 * cases where the core code doesn't have it and it should 87 * really live in igc_defines.h or igc_regs.h, but we keep 88 * it here to avoid modifying those. 89 * 90 * igc_ring.c This implements the core I/O routines of the device 91 * driver, starting with the descriptor ring setup and tear 92 * down as well as DMA, descriptor ring, and per-frame 93 * memory. It also implements all of the primary logic for 94 * transmitting and receiving frames. 95 * 96 * igc_stat.c This file deals with kstat creation and destruction as 97 * well as reading and fetching all of the registers that 98 * exist in hardware. 99 * 100 * There are a few primary data structures to be aware of. Their relationships 101 * are shown in the following image and then described. Note, each structure has 102 * many more fields than those pictured: 103 * 104 * +---------------+ 105 * | dev_info_t * | 106 * | -|-+ 107 * | private data | | 108 * +---------------+ v 109 * +------------------------------+ +---------------------+ 110 * | igc_t | | igc_addr_t | 111 * | per-instance primary | +---->| | 112 * | structure | |+--->| Notes a MAC address | ... 113 * | | || | stored in hardware | 114 * | igc_addr_t *igc_ucast -|--+| +---------------------+ 115 * | igc_addr_t *igc_mcast -|---+ +---------------------------+ 116 * | struct igc_hw *igc_hw -|--------->| struct igc_hw (core code) | 117 * | igc_tx_ring_t *igc_tx_rings -|--+ | | 118 * | igc_rx_ring_t *igc_rx_rings -|--|---+ | igc_mac_info mac | 119 * +------------------------------+ | | | igc_fc_info fc | 120 * | | | igc_phy_info phy | 121 * +----------------------------------+ | | igc_nvm_info nvm | 122 * | v +---------------------------+ 123 * | +--------------------------------------+ 124 * | | igc_rx_ring_t | 125 * | | | 126 * | | igc_adv_rx_desc *irr_ring ---|--> rx hw descriptor ring 127 * | | uint32_t irr_next ---|--> next entry to look for data 128 * | | igc_rx_buffer_t **irr_work_list ---|--> corresponds to ring entries 129 * | | uint32_t irr_nfree ---|--> number of free list entries 130 * | | igc_rx_buffer_t **irr_free_list ---|--> set of buffers free for bind 131 * | | igc_rx_buffer_t *irr_arena ---|-+> array of all rx buffers 132 * | +--------------------------------------+ | 133 * | | 134 * | +----------------------------+ | 135 * | | igc_rx_buffer_t |<--+ 136 * | | | 137 * | | mblk_t *irb_mp -|---> mblk_t for rx buffer 138 * | | igc_dma_buffer_t irb_dma -|---> DMA memory for rx buffer 139 * | +----------------------------+ 140 * | 141 * | +------------------------------------+ 142 * +-->| igc_tx_ring_t | 143 * | | 144 * | igc_adv_tx_desc *itr_ring --|--> tx hw descriptor ring 145 * | uint32_t itr_ring_head --|--> next descriptor to recycle 146 * | uint32_t itr_ring_tail --|--> next descriptor to place 147 * | uint32_t itr_ring_free --|--> free descriptors in ring 148 * | igc_tx_buffer_t **itr_work_list |--> corresponds to ring entries 149 * | list_t itr_free_list --|--> available tx
buffers 150 * | igc_tx_buffer_t *itr_arena --|-+> array of all tx buffers 151 * +------------------------------------+ | 152 * | 153 * +---------------------------------+ | 154 * | igc_tx_buffer_t |<-+ 155 * | | 156 * | mblk_t *itb_mp --|--> mblk to tx (only in first) 157 * | igc_dma_buffer_t itb_dma --|--> tx DMA buffer for copy 158 * | ddi_dma_handle_t itb_bind_hdl --|--> DMA handle for bind 159 * +---------------------------------+ 160 * 161 * igc_t This is the primary data structure that exists for each 162 * instance of the driver. There is generally a 1:1 163 * relationship between a physical port, an instance of the 164 * driver, and a PCI function. This structure provides 165 * access to the device's registers and it embeds the 166 * common code's struct igc_hw. 167 * 168 * struct igc_hw This structure is used by the core code and it contains 169 * information related to the MAC, PHY, NVM, and other 170 * information that the device uses. In general, this 171 * structure is used when performing API calls to the 172 * common code. The common code calls back into us in the 173 * igc_osdep.c interfaces. 174 * 175 * igc_tx_ring_t This structure represents a single transmit ring in 176 * hardware, its associated software state, and 177 * miscellaneous data like statistics, MAC handles, etc. 178 * See the 'TX Data Path Design' section for more 179 * information. 180 * 181 * igc_rx_ring_t This is the receive variant of a ring. It represents and 182 * tracks the hardware state along with all our metadata. 183 * One of these exists for each receive ring that we've 184 * enabled (currently one). See the 'RX Data Path Design' 185 * section for more information. 186 * 187 * igc_tx_buffer_t This represents a single tx buffer in the driver. A tx 188 * buffer contains DMA based storage that it can use to 189 * transmit a packet and contains a second DMA handle that 190 * can be used to bind a specific mblk_t to it. tx buffers 191 * are capped at the current page size and can be smaller 192 * if the maximum packet size is smaller. A 1500 byte MTU 193 * will end up with a 2 KiB buffer due to the device's 194 * internal alignment requirements. 195 * 196 * igc_rx_buffer_t This represents a single rx buffer in the driver. These 197 * buffers may be loaned up to MAC and then returned to us 198 * later. They contain a single DMA buffer which right now 199 * is a single contiguous buffer that fits the maximum 200 * packet size. Each buffer has a corresponding mblk_t that 201 * it is mapped to. 202 * 203 * igc_dma_buffer_t This represents a DMA buffer in the system. DMA buffers 204 * are used for transmit buffers, receive buffers, or 205 * various ring descriptor entries. The DMA buffer 206 * structure is not inherently limited to a specific number 207 * of cookies. It is always mapped in our virtual address 208 * space and encapsulates the various DDI functions. In 209 * general, one expects to interface with the idb_va member 210 * when needing to access the memory, the idb_size member 211 * when wanting to understand how much memory is in the 212 * buffer, and the idb_hdl member when needing to access 213 * the DMA cookies. 214 * 215 * igc_addr_t This represents a 48-bit Ethernet MAC address slot in 216 * the hardware that may or may not be used at any given 217 * point in time. 218 * 219 * -------------------- 220 * Rings and Interrupts 221 * -------------------- 222 * 223 * The I225/226 controller like the I210 supports up to 4 rx and tx rings.
Due 224 * to the long history of this controller and its lineage from the e1000g/igb 225 * days and much older parts like the 8254x series, it has two entirely 226 * different interrupt modes: one where MSI-X is used and one where 227 * a single MSI or INTx interrupt is used. Currently the driver only supports 228 * the MSI-X mode, as that gives us more flexibility and, because the 229 * interrupt modes and register handling are different, reduces the complexity 230 * in the driver. 231 * 232 * The hardware uses its IVAR registers to map specific queues to interrupts. 233 * Each rx queue and tx queue is mapped to a specific bit position in the IVAR 234 * and there is an additional IVAR register for miscellaneous causes like link 235 * state changes. While the IVAR register allows for several bits for MSI-X 236 * entries, for the most part, it appears that there is only support for values 237 * in the range [0, 4] based on the I210 which we believe extends to the I225/6. 238 * 239 * MSI-X mode causes the device's various interrupt registers to be split into 240 * two groups: the 'legacy' and 'extended' (sometimes called advanced) ones. The 241 * extended ones all start with 'E'. When in MSI-X mode, the EICR (cause), EICS 242 * (cause set), EIAC (auto-clear), EIMS (mask set) registers all operate with 243 * indexes that refer to the MSI-X vectors. The primary way to temporarily disable 244 * interrupts for polling is to remove the given MSI-X from the auto-clear set 245 * and to clear it from the enabled mask set. 246 * 247 * The implication of all of this is that we can only really disable interrupts 248 * for polling on a per-MSI-X basis. This generally means that the design for 249 * interrupts and rings is that all the tx rings and the link state change 250 * events share interrupt 0, while rx rings use interrupts 1-4. Because the x86 251 * 'apix' module ends up defaulting to two interrupts per driver, we end up 252 * only supporting a single rx and tx ring for the time being, though the driver 253 * is phrased in terms of a variable number of such rings. 254 * 255 * ------------------- 256 * RX Data Path Design 257 * ------------------- 258 * 259 * The rx data path is based around allocating a fixed number of receive buffers 260 * for each ring. We have two goals in the buffer allocation and ring design: 261 * 262 * 1) We want to make sure that the ring is always full of valid descriptors for 263 * rx to prevent stalls. One implication of this is that we will always 264 * refill a received buffer with a new one and notify the hardware that the 265 * buffer is usable again. 266 * 267 * 2) We would prefer to not have to copy received memory and instead bind the 268 * DMA memory directly into an mblk_t. 269 * 270 * To satisfy (1) we need to allocate at least as many rx buffers as there are 271 * ring entries. The ring is sized by default to 512 entries, which is a 272 * somewhat arbitrary, but common, size. We then say that we want to be able to 273 * loan half of our entries up the stack at any given time. This leads to us 274 * allocating 1.5x the ring size rx buffers. 275 * 276 * All of the rx buffers are stored in the irr_arena array. They are then split 277 * between the free list and the ring's work list. The work list is an array 278 * that is a 1:1 mapping to a location in the descriptor ring. That is, index 4 279 * of the work list (irr_work_list[4]) corresponds to index 4 of the descriptor 280 * ring (irr_ring[4]).
However, this may refer to any of the rx descriptors that 281 * is in the irr_arena. When we start up the ring, the first ring size entries 282 * are all inserted into the work list and then the remaining entries are 283 * inserted into the free list. 284 * 285 * Entries that are in the work list are always given to hardware. We track the 286 * next place for us to scan for received packets through the 'irr_next' index 287 * into the descriptor ring. When an interrupt fires, we start at irr_next and 288 * iterate through the descriptor ring continuing while we find valid, received 289 * packets. When we process a packet, we look at two things to consider whether 290 * we bind it or copy it to a new mblk_t. The first piece is the received 291 * packet's length. If the packet is small, there is not much value in binding 292 * it and instead we just allocate and copy a new buffer for the packet. 293 * 294 * The second is if there are free rx descriptors. To keep goal (1) valid, we 295 * only will loan a packet up if there is an entry on the free list that can 296 * replace the rx buffer, as otherwise we'd want to make sure we don't stall the 297 * ring. If an rx buffer is loaned, the entry on the free list takes its place 298 * in the descriptor ring and when the networking stack is finally done with the 299 * mblk_t, it'll be returned to us as part of the freemsg()/freeb() destructor. 300 * This lifetime is illustrated in the following diagram: 301 * 302 * 303 * +-------------+ +-----------+ 304 * | Work List |<---*-------------------| Free List | 305 * | Owned by HW | . . Used to replace | Idle | 306 * +-------------+ loaned buffers +-----------+ 307 * | | ^ ^ 308 * | | . . . Reused if a | 309 * | +-+ copy is done . . . Returned to driver via 310 * | | freemsg() which calls 311 * | | igc_rx_recycle(). 312 * v | 313 * +-------------------+ | 314 * | Loaned |------------------------+ 315 * | Owned by netstack | 316 * +-------------------+ 317 * 318 * Currently the rx data path uses rx buffers that are equal to the maximum size 319 * of a packet (rounded up based on hardware's 1 KiB alignment requirement). 320 * This was mostly done for initial simplicity, though it comes at a memory 321 * cost. It is possible to design this to be more like the tx subsystem where we 322 * use fixed page size buffers and just cons up an mblk_t chain with b_cont 323 * pointers. 324 * 325 * ------------------- 326 * TX Data Path Design 327 * ------------------- 328 * 329 * The tx data path is a bit different in design from the rx data path. When the 330 * system wants to tx data there are two fundamental building blocks that we 331 * use, both of which leverage the igc_tx_buffer_t: 332 * 333 * 1) We use the DMA memory that is allocated with the buffer and copy the 334 * mblk_t data into it. This is used when we have small mblk_t's. 335 * 336 * 2) We utilize the DMA handle that is in the tx buffer (but not the buffer's 337 * DMA memory) to perform DMA binding. This can result in multiple cookies 338 * and therefore descriptors mapping to the single buffer. 339 * 340 * Because a given tx buffer may end up using more than one descriptor and we 341 * have to account for transmit context descriptors, which are used for 342 * indicating checksum and segmentation offloads, we end up only allocating a 343 * number of transmit buffers equal to the ring size. In addition, the tx data 344 * buffer's maximum size is capped at the size of a single page. 
This is done 345 * because we often aren't going to be copying and if we are, we don't need that 346 * much more memory. The actual size may be smaller depending on the MTU. 347 * 348 * The tx descriptor ring is used in a bit of a different way. While part of the 349 * reason for this is that we are filling it based on the stack's demands and 350 * therefore only need to fill in descriptors when there's a need, the second 351 * reason is because of how the hardware reports back events. There are two 352 * major kinds of descriptors that can be entered into the ring. There are the 353 * aforementioned context descriptors and then data descriptors. While data 354 * descriptors support an interrupt on completion, context descriptors do not. 355 * 356 * When an mblk_t comes in to be transmitted, we walk all of the mblk_t's 357 * associated with it via the b_cont pointer. For each one, we look at the size 358 * of the data and determine whether or not to perform DMA binding or to copy it 359 * into the current tx buffer. A given tx buffer can be used to copy multiple 360 * different mblk_t's. Imagine a pathological case where we had a 500 byte 361 * packet split into 125 byte chunks, this would end up using a single tx data 362 * buffer. However, if you imagine a large chunk of TCP data, this may be 363 * spread across several mblk_t's so we may end up leveraging multiple tx data 364 * buffers. 365 * 366 * The transmit buffers that are available are stored on a free list. This is 367 * managed as a list_t as we end up needing to often track groups of descriptors 368 * to allocate and free across packet transmit and recycling. We don't count the 369 * number of transmit buffers that are free per se, but it generally tracks the 370 * number of free descriptors which do track as in the worst case there is a 1:1 371 * relationship between buffers and descriptors and more generally it's 1:n, 372 * that is there are multiple descriptors used for a single buffer. 373 * 374 * The transmit ring is managed through a combination of three integers, the 375 * itr_ring_head, the itr_ring_tail, and the itr_ring_free. The ring's tail 376 * represents the place where the driver will place new data to transmit. The 377 * ring's head represents the first place that we should check for a packet's 378 * completion when we're performing recycling (the act of acknowledging what 379 * hardware has processed internal to the driver) due to a tx interrupt or 380 * manual recycling in the transmit path. 381 * 382 * When placing a packet as a series of descriptor rings we'll end up doing the 383 * following: 384 * 385 * 1) First we determine how to map each mblk_t as mentioned above. 386 * 2) This will then be turned into descriptors in the ring. Each tx data buffer 387 * that is used is placed in the itr_work_list at the corresponding index 388 * that they are used in the ring. There is one special case here, if a 389 * context descriptor is used, the first transmit buffer will refer to the 390 * context descriptor's entry (which always comes before data). 391 * 3) We'll ensure that there are enough descriptors for this packet to fit into 392 * the ring or if it would exceed our mandatory gap threshold. If so, then 393 * we'll undo all the work we just did and return the mblk_t to MAC and 394 * indicate that the ring is blocked. MAC will be notified later when we free 395 * up transmit descriptors. 
396 * 4) In the first transmit data buffer we'll store both the mblk_t and then 397 * we'll store what the index of the last descriptor that's used is. This is 398 * important for recycling. We also indicate that the last descriptor should 399 * be the one that reports its status on interrupt completion. 400 * 5) We'll notify hardware that there is data for it to transmit by writing to 401 * the ring's tail pointer. 402 * 403 * This all works reasonably okay, except for the small problem of the bill, 404 * which we pay off in the form of recycling. Recycling is going through the 405 * ring and seeing which descriptors are free. While the transmit path described 406 * above is the only path that is allowed to move the tail, the recycling path 407 * is the only one that's allowed to adjust the head. 408 * 409 * When we perform recycling we look at the current head and its corresponding 410 * tx buffer. There will always be a tx buffer in the same index in the 411 * itr_work_list[] unless a serious programmer error has occurred. This buffer 412 * will tell us what the index to check for completion is via its itb_last_desc 413 * member (only valid when itb_first is set to true). If this index indicates 414 * that it has been processed by hardware, then we process all entries between 415 * here and there. 416 * 417 * When we process descriptors, we bunch up the transmit descriptors and 418 * mblk_t's. We'll reset the transmit descriptor (freeing any DMA binding if 419 * used) and append the mblk_t if it exists to be freed in one large 420 * freemsgchain() at the end. The fact that we won't free any tx buffers 421 * associated with a packet until they're all done is important. This makes 422 * sure that any memory that we have bound from the mblk_t remains valid the 423 * entire time. 424 * 425 * If we have freed enough descriptors as part of this to allow mac to send data 426 * again, then once we have finished all processing and dropped the lock, we 427 * will notify MAC. 428 * 429 * When we are processing descriptors here we try to avoid holding the itr_lock 430 * except for the start and end of the process. This is an important way to 431 * ensure that we don't block transmits. Because of this, there can only be one 432 * thread performing a recycle at any given time between the interrupt path and 433 * the transmit path trying to clean up. This is maintained using the 434 * 'itr_recycle' boolean. If a recycle is already in progress then there's 435 * generally not much reason to perform one simultaneously and so the caller 436 * will just return. This is why the head (and thus returning descriptors) is 437 * only used by the recycle path. 438 * 439 * ------- 440 * Locking 441 * ------- 442 * 443 * Mutexes exist on three different structures in the driver: 444 * 445 * 1) igc_t (igc_lock) 446 * 2) igc_rx_ring_t (irr_lock, irr_free_lock) 447 * 3) igc_tx_ring_t (itr_lock) 448 * 449 * The following rules hold for locking in the driver: 450 * 451 * 1) One should not hold locks for both the rx rings and tx rings at the same 452 * time. If this is required, please determine if it is absolutely necessary. 453 * 2) You should always take the controller's lock ahead of any ring's locks. 454 * 3) The general rx ring lock (irr_lock) should be taken ahead of the free list 455 * lock (irr_free_lock) if both are required. 
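 *
 * As a purely illustrative sketch (no current code path needs all three
 * classes of locks at once), a hypothetical path that did would honor the
 * rules above by acquiring in this order and releasing in the reverse
 * order, where 'ring' is one of the rx rings:
 *
 *	mutex_enter(&igc->igc_lock);
 *	mutex_enter(&ring->irr_lock);
 *	mutex_enter(&ring->irr_free_lock);
 *	...
 *	mutex_exit(&ring->irr_free_lock);
 *	mutex_exit(&ring->irr_lock);
 *	mutex_exit(&igc->igc_lock);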
456 * 457 * ------------------- 458 * Future Improvements 459 * ------------------- 460 * 461 * This driver was initially written with an eye towards getting something that 462 * had broad use for folks with this hardware and not towards enabling every 463 * feature immediately. Here are some areas that can be improved upon in the 464 * driver. 465 * 466 * - Multiple ring, RSS support: As the OS changes towards offering more 467 * interrupts or opting to participate in IRM, then you can more easily 468 * offer RSS and related features. This should likely show up as a single 469 * rx group with multiple rings and leverage the tx pseudo-group support. 470 * 471 * - TCP segmentation offload support: Right now the driver does not support 472 * TSO. It'd potentially be a useful addition and help out folks. Fetching 473 * information for TSO is in the tx data path right now. 474 * 475 * - FMA Support: Currently the driver does not rig up support for FMA. 476 * Participating in that and more generally being able to reset the device 477 * while it is operating in the face of fatal errors would be good. 478 * 479 * - TX stall detection: Related to the above, carefully designing a tx stall 480 * detection and resetting the device when that happens would probably be 481 * useful. 482 * 483 * - UFM support: Exposing the NVM and PBA (printed board assembly) through the 484 * UFM subsystem would be a good thing to do. 485 * 486 * - Dynamic MTU changing: Right now the driver takes advantage of the 487 * simplification of not allowing the MTU to change once the device has been 488 * started. This isn't great, but it is far from the first (igb, e1000g, 489 * ixgbe, etc.) to do this. It would be nice if this was lifted. 490 */ 491 492 #include <sys/ddi.h> 493 #include <sys/sunddi.h> 494 #include <sys/conf.h> 495 #include <sys/devops.h> 496 #include <sys/modctl.h> 497 #include <sys/cmn_err.h> 498 #include <sys/pci.h> 499 #include <sys/sysmacros.h> 500 #include <sys/debug.h> 501 #include <sys/bitext.h> 502 503 #include "igc.h" 504 505 /* 506 * The core code expects the igc_mcast_raw to be a uint8_t packed array. We use 507 * the ether_addr_t to make this a little more explicit and easy to reason 508 * about, but that means we are relying on this size. 509 */ 510 CTASSERT(sizeof (ether_addr_t) == 6); 511 512 uint32_t 513 igc_read32(igc_t *igc, uint32_t reg) 514 { 515 uint32_t *addr; 516 ASSERT3U(reg, <, igc->igc_regs_size); 517 addr = (uint32_t *)(igc->igc_regs_base + reg); 518 return (ddi_get32(igc->igc_regs_hdl, addr)); 519 } 520 521 void 522 igc_write32(igc_t *igc, uint32_t reg, uint32_t val) 523 { 524 uint32_t *addr; 525 ASSERT3U(reg, <, igc->igc_regs_size); 526 addr = (uint32_t *)(igc->igc_regs_base + reg); 527 ddi_put32(igc->igc_regs_hdl, addr, val); 528 } 529 530 /* 531 * Ask hardware if the link is up and ready. Note, this assumes that we're on a 532 * copper phy and short circuits a few things. See igb_is_link_up() for what 533 * this looks like for non-copper PHYs if that ever becomes relevant. 534 */ 535 static bool 536 igc_link_up(igc_t *igc) 537 { 538 ASSERT(MUTEX_HELD(&igc->igc_lock)); 539 540 /* 541 * When the link is up, then the core code will clear the value below. 542 * Otherwise we likely need to assume it's down. 
543 */ 544 (void) igc_check_for_link(&igc->igc_hw); 545 return (!igc->igc_hw.mac.get_link_status); 546 } 547 548 static void 549 igc_intr_lsc(igc_t *igc) 550 { 551 link_state_t orig_state, new_state; 552 uint32_t mmd_base; 553 554 mutex_enter(&igc->igc_lock); 555 orig_state = igc->igc_link_state; 556 557 /* 558 * Always force a check of the link. 559 */ 560 igc->igc_hw.mac.get_link_status = true; 561 if (igc_link_up(igc)) { 562 uint16_t duplex = 0; 563 564 (void) igc_get_speed_and_duplex(&igc->igc_hw, 565 &igc->igc_link_speed, &duplex); 566 567 switch (duplex) { 568 case HALF_DUPLEX: 569 igc->igc_link_duplex = LINK_DUPLEX_HALF; 570 break; 571 case FULL_DUPLEX: 572 igc->igc_link_duplex = LINK_DUPLEX_FULL; 573 break; 574 default: 575 igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN; 576 break; 577 } 578 igc->igc_link_state = LINK_STATE_UP; 579 } else { 580 igc->igc_link_state = LINK_STATE_DOWN; 581 igc->igc_link_speed = 0; 582 igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN; 583 } 584 new_state = igc->igc_link_state; 585 586 /* 587 * Next, grab a bunch of information from the PHY for future us. 588 */ 589 (void) igc_read_phy_reg(&igc->igc_hw, PHY_CONTROL, &igc->igc_phy_ctrl); 590 (void) igc_read_phy_reg(&igc->igc_hw, PHY_STATUS, &igc->igc_phy_status); 591 (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_ADV, 592 &igc->igc_phy_an_adv); 593 (void) igc_read_phy_reg(&igc->igc_hw, PHY_LP_ABILITY, 594 &igc->igc_phy_lp); 595 (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_EXP, 596 &igc->igc_phy_an_exp); 597 (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_CTRL, 598 &igc->igc_phy_1000t_ctrl); 599 (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_STATUS, 600 &igc->igc_phy_1000t_status); 601 (void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS, 602 &igc->igc_phy_ext_status); 603 (void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS, 604 &igc->igc_phy_ext_status); 605 606 mmd_base = STANDARD_AN_REG_MASK << MMD_DEVADDR_SHIFT; 607 (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_CTRL, 608 &igc->igc_phy_mmd_ctrl); 609 (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_STS1, 610 &igc->igc_phy_mmd_sts); 611 mutex_exit(&igc->igc_lock); 612 613 if (orig_state != new_state) { 614 mac_link_update(igc->igc_mac_hdl, new_state); 615 } 616 } 617 618 static uint_t 619 igc_intr_rx_queue(caddr_t arg1, caddr_t arg2) 620 { 621 igc_t *igc = (igc_t *)arg1; 622 uintptr_t queue = (uintptr_t)arg2; 623 igc_rx_ring_t *ring; 624 mblk_t *mp = NULL; 625 626 ASSERT3U(queue, <, igc->igc_nrx_rings); 627 ring = &igc->igc_rx_rings[queue]; 628 629 mutex_enter(&ring->irr_lock); 630 if ((ring->irr_flags & IGC_RXR_F_POLL) == 0) { 631 mp = igc_ring_rx(ring, IGC_RX_POLL_INTR); 632 } 633 mutex_exit(&ring->irr_lock); 634 635 if (mp != NULL) { 636 mac_rx_ring(igc->igc_mac_hdl, ring->irr_rh, mp, ring->irr_gen); 637 } 638 639 return (DDI_INTR_CLAIMED); 640 } 641 642 static uint_t 643 igc_intr_tx_other(caddr_t arg1, caddr_t arg2) 644 { 645 igc_t *igc = (igc_t *)arg1; 646 uint32_t icr = igc_read32(igc, IGC_ICR); 647 648 igc_tx_recycle(igc, &igc->igc_tx_rings[0]); 649 650 if ((icr & IGC_ICR_LSC) != 0) { 651 igc_intr_lsc(igc); 652 } 653 654 return (DDI_INTR_CLAIMED); 655 } 656 657 static bool 658 igc_setup_regs(igc_t *igc) 659 { 660 int ret; 661 ddi_device_acc_attr_t da; 662 663 if (pci_config_setup(igc->igc_dip, &igc->igc_cfgspace) != DDI_SUCCESS) { 664 dev_err(igc->igc_dip, CE_WARN, "failed to map config space"); 665 return (false); 666 } 667 668 if (ddi_dev_regsize(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_size) != 669 
DDI_SUCCESS) { 670 dev_err(igc->igc_dip, CE_WARN, "failed to get BAR %u size", 671 IGC_PCI_BAR - 1); 672 return (false); 673 } 674 675 bzero(&da, sizeof (ddi_device_acc_attr_t)); 676 da.devacc_attr_version = DDI_DEVICE_ATTR_V1; 677 da.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC; 678 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 679 da.devacc_attr_access = DDI_DEFAULT_ACC; 680 681 if ((ret = ddi_regs_map_setup(igc->igc_dip, IGC_PCI_BAR, 682 &igc->igc_regs_base, 0, igc->igc_regs_size, &da, 683 &igc->igc_regs_hdl)) != DDI_SUCCESS) { 684 dev_err(igc->igc_dip, CE_WARN, "failed to map registers: %d", 685 ret); 686 return (false); 687 } 688 689 return (true); 690 } 691 692 /* 693 * Go through the process of initializing the igc core code. First we have to 694 * fill in the information that the common code requires to identify the 695 * hardware and set the mac type. After that we can go through and set up all of 696 * the function initialization. 697 */ 698 static bool 699 igc_core_code_init(igc_t *igc) 700 { 701 int ret; 702 int *regs; 703 uint_t nprop; 704 705 igc->igc_hw.back = igc; 706 igc->igc_hw.vendor_id = pci_config_get16(igc->igc_cfgspace, 707 PCI_CONF_VENID); 708 igc->igc_hw.device_id = pci_config_get16(igc->igc_cfgspace, 709 PCI_CONF_DEVID); 710 igc->igc_hw.revision_id = pci_config_get8(igc->igc_cfgspace, 711 PCI_CONF_REVID); 712 igc->igc_hw.subsystem_vendor_id = pci_config_get16(igc->igc_cfgspace, 713 PCI_CONF_SUBVENID); 714 igc->igc_hw.subsystem_device_id = pci_config_get16(igc->igc_cfgspace, 715 PCI_CONF_SUBSYSID); 716 717 if ((ret = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, igc->igc_dip, 718 DDI_PROP_DONTPASS, "reg", ®s, &nprop)) != DDI_PROP_SUCCESS) { 719 dev_err(igc->igc_dip, CE_WARN, "failed to look up 'reg' " 720 "property: %d", ret); 721 return (false); 722 } 723 724 /* 725 * We fill out the function and command word. We currently don't fill 726 * out the bus type, speed, and width as it's not used by the common 727 * code, leaving it all at unknown. We can grab that information when it 728 * needs it. We do fill out the function and command word as the former 729 * is important and the latter is easy to grab. 730 */ 731 igc->igc_hw.bus.func = PCI_REG_FUNC_G(regs[0]); 732 igc->igc_hw.bus.pci_cmd_word = pci_config_get16(igc->igc_cfgspace, 733 PCI_CONF_COMM); 734 ddi_prop_free(regs); 735 736 /* 737 * The common code asks for the memory mapped address to be set in its 738 * structure. Though in theory it promises not to use it. 739 */ 740 igc->igc_hw.hw_addr = (uint8_t *)igc->igc_regs_base; 741 742 if ((ret = igc_set_mac_type(&igc->igc_hw)) != IGC_SUCCESS) { 743 dev_err(igc->igc_dip, CE_WARN, "failed to set mac type: %d", 744 ret); 745 return (false); 746 } 747 748 if ((ret = igc_setup_init_funcs(&igc->igc_hw, true)) != IGC_SUCCESS) { 749 dev_err(igc->igc_dip, CE_WARN, "failed to setup core code " 750 "function pointers: %d", ret); 751 return (false); 752 } 753 754 /* 755 * Go ahead and attempt to get the bus information even though this 756 * doesn't actually do anything right now. 
757 */ 758 if ((ret = igc_get_bus_info(&igc->igc_hw)) != IGC_SUCCESS) { 759 dev_err(igc->igc_dip, CE_WARN, "core code failed to get bus " 760 "info: %d", ret); 761 return (false); 762 } 763 764 return (true); 765 } 766 767 static bool 768 igc_limits_init(igc_t *igc) 769 { 770 switch (igc->igc_hw.mac.type) { 771 case igc_i225: 772 igc->igc_limits.il_max_rx_rings = IGC_MAX_RX_RINGS_I225; 773 igc->igc_limits.il_max_tx_rings = IGC_MAX_RX_RINGS_I225; 774 igc->igc_limits.il_max_mtu = IGC_MAX_MTU_I225; 775 break; 776 default: 777 dev_err(igc->igc_dip, CE_WARN, "unknown MAC type: %u", 778 igc->igc_hw.mac.type); 779 return (false); 780 } 781 782 return (true); 783 } 784 785 /* 786 * Determine the hardware buffer sizes that are required for the given MTU. 787 * There are a few different constraints that we try to enforce here that come 788 * from the hardware and others that come from us: 789 * 790 * 1) The hardware requires that the rx and tx sizes all be 1 KiB (0x400) byte 791 * aligned. 792 * 2) Our tx engine can handle copying across multiple descriptors, so we cap 793 * the maximum tx buffer size at one page. 794 * 3) Right now our rx engine does not handle scanning multiple buffers for rx 795 * (see the theory statement), so we end up making the rx buffer have to fix the 796 * maximum frame size. 797 * 4) rx buffers need to also account for IP alignment, so we make sure to 798 * allocate extra bytes for that. 799 */ 800 void 801 igc_hw_buf_update(igc_t *igc) 802 { 803 unsigned long pagesize = ddi_ptob(igc->igc_dip, 1); 804 uint32_t tx_mtu; 805 806 igc->igc_max_frame = igc->igc_mtu + sizeof (struct ether_vlan_header) + 807 ETHERFCSL; 808 igc->igc_rx_buf_size = P2ROUNDUP_TYPED(igc->igc_max_frame + 809 IGC_RX_BUF_IP_ALIGN, IGC_BUF_ALIGN, uint32_t); 810 tx_mtu = P2ROUNDUP_TYPED(igc->igc_max_frame, IGC_BUF_ALIGN, uint32_t); 811 igc->igc_tx_buf_size = MIN(tx_mtu, pagesize); 812 } 813 814 static bool 815 igc_intr_init(igc_t *igc) 816 { 817 int ret, types, nintrs, navail, req; 818 const int min_nintrs = 2; 819 820 if ((ret = ddi_intr_get_supported_types(igc->igc_dip, &types)) != 821 DDI_SUCCESS) { 822 dev_err(igc->igc_dip, CE_WARN, "failed to get supported " 823 "interrupts: %d", ret); 824 return (false); 825 } 826 827 /* 828 * For now, we simplify our lives and device support by only supporting 829 * MSI-X interrupts. When we find versions of this without MSI-X 830 * support, we can go and add what we need. 
831 */ 832 if ((types & DDI_INTR_TYPE_MSIX) == 0) { 833 dev_err(igc->igc_dip, CE_WARN, "device does not support MSI-X, " 834 "found %d", types); 835 return (false); 836 } 837 838 if ((ret = ddi_intr_get_nintrs(igc->igc_dip, DDI_INTR_TYPE_MSIX, 839 &nintrs)) != DDI_SUCCESS) { 840 dev_err(igc->igc_dip, CE_WARN, "failed to get number of " 841 "supported MSI-X interrupts: %d", ret); 842 return (false); 843 } 844 845 if (nintrs < min_nintrs) { 846 dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires " 847 "%d MSI-X interrupts be supported, found %d", min_nintrs, 848 nintrs); 849 return (false); 850 } 851 852 if ((ret = ddi_intr_get_navail(igc->igc_dip, DDI_INTR_TYPE_MSIX, 853 &navail)) != DDI_SUCCESS) { 854 dev_err(igc->igc_dip, CE_WARN, "failed to get number of " 855 "available MSI-X interrupts: %d", ret); 856 return (false); 857 } 858 859 if (navail < min_nintrs) { 860 dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires " 861 "%d MSI-X interrupts be available, found %d", min_nintrs, 862 navail); 863 return (false); 864 } 865 866 /* 867 * In the future this could be based upon the multiple queues that the 868 * device supports, but for now it's limited to two. See 'Rings and 869 * Interrupts' in the theory statement for more background. 870 */ 871 req = min_nintrs; 872 req = MIN(req, navail); 873 igc->igc_intr_size = req * sizeof (ddi_intr_handle_t); 874 igc->igc_intr_handles = kmem_alloc(igc->igc_intr_size, KM_SLEEP); 875 876 if ((ret = ddi_intr_alloc(igc->igc_dip, igc->igc_intr_handles, 877 DDI_INTR_TYPE_MSIX, 0, req, &igc->igc_nintrs, 878 DDI_INTR_ALLOC_NORMAL)) != DDI_SUCCESS) { 879 dev_err(igc->igc_dip, CE_WARN, "failed to allocate interrupts: " 880 "%d", ret); 881 return (false); 882 } 883 884 igc->igc_intr_type = DDI_INTR_TYPE_MSIX; 885 igc->igc_attach |= IGC_ATTACH_INTR_ALLOC; 886 if (igc->igc_nintrs < min_nintrs) { 887 dev_err(igc->igc_dip, CE_WARN, "received %d interrupts, but " 888 "needed at least %d", igc->igc_nintrs, min_nintrs); 889 return (false); 890 } 891 892 if ((ret = ddi_intr_get_pri(igc->igc_intr_handles[0], 893 &igc->igc_intr_pri)) != DDI_SUCCESS) { 894 dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt " 895 "priority: %d", ret); 896 return (false); 897 } 898 899 if ((ret = ddi_intr_get_cap(igc->igc_intr_handles[0], 900 &igc->igc_intr_cap)) != DDI_SUCCESS) { 901 dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt " 902 "capabilities: %d", ret); 903 return (false); 904 } 905 906 return (true); 907 } 908 909 /* 910 * As part of allocating our rings we make the following assumptions about 911 * interrupt assignments. All tx rings share interrupt 0. All rx rings have 912 * separate interrupts starting from interrupt 1. 
This design may likely change 913 * in the face of actual multi-ring support. 914 */ 915 static bool 916 igc_rings_alloc(igc_t *igc) 917 { 918 uint32_t intr = 0; 919 igc->igc_tx_rings = kmem_zalloc(sizeof (igc_tx_ring_t) * 920 igc->igc_ntx_rings, KM_SLEEP); 921 922 for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { 923 igc->igc_tx_rings[i].itr_igc = igc; 924 igc->igc_tx_rings[i].itr_idx = i; 925 igc->igc_tx_rings[i].itr_intr_idx = intr; 926 mutex_init(&igc->igc_tx_rings[i].itr_lock, NULL, MUTEX_DRIVER, 927 DDI_INTR_PRI(igc->igc_intr_pri)); 928 if (!igc_tx_ring_stats_init(igc, &igc->igc_tx_rings[i])) { 929 return (false); 930 } 931 } 932 933 igc->igc_rx_rings = kmem_zalloc(sizeof (igc_rx_ring_t) * 934 igc->igc_nrx_rings, KM_SLEEP); 935 intr = 1; 936 937 for (uint32_t i = 0; i < igc->igc_nrx_rings; i++, intr++) { 938 igc->igc_rx_rings[i].irr_igc = igc; 939 igc->igc_rx_rings[i].irr_idx = i; 940 igc->igc_rx_rings[i].irr_intr_idx = intr; 941 mutex_init(&igc->igc_rx_rings[i].irr_lock, NULL, MUTEX_DRIVER, 942 DDI_INTR_PRI(igc->igc_intr_pri)); 943 mutex_init(&igc->igc_rx_rings[i].irr_free_lock, NULL, 944 MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri)); 945 cv_init(&igc->igc_rx_rings[i].irr_free_cv, NULL, CV_DRIVER, 946 NULL); 947 if (!igc_rx_ring_stats_init(igc, &igc->igc_rx_rings[i])) { 948 return (false); 949 } 950 } 951 952 ASSERT3U(intr, ==, igc->igc_nintrs); 953 954 return (true); 955 } 956 957 /* 958 * Allocate our interrupts. Note, we have more or less constrained the device 959 * right now to only request two interrupts which we use in a fixed way. If we 960 * end up with more varied queue support then this should be changed around. 961 */ 962 static bool 963 igc_intr_hdlr_init(igc_t *igc) 964 { 965 int ret; 966 967 if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[0], 968 igc_intr_tx_other, igc, NULL)) != DDI_SUCCESS) { 969 dev_err(igc->igc_dip, CE_WARN, "failed to add tx/other " 970 "interrupt handler: %d", ret); 971 return (false); 972 } 973 974 if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[1], 975 igc_intr_rx_queue, igc, (uintptr_t)0)) != DDI_SUCCESS) { 976 dev_err(igc->igc_dip, CE_WARN, "failed to add rx interrupt " 977 "handler: %d", ret); 978 if ((ret = ddi_intr_remove_handler(igc->igc_intr_handles[0])) != 979 DDI_SUCCESS) { 980 dev_err(igc->igc_dip, CE_WARN, "failed to remove " 981 "tx/other interrupt handler"); 982 } 983 return (false); 984 } 985 986 return (true); 987 } 988 989 static void 990 igc_hw_control(igc_t *igc, bool take) 991 { 992 uint32_t ctrl = igc_read32(igc, IGC_CTRL_EXT); 993 994 if (take) { 995 ctrl |= IGC_CTRL_EXT_DRV_LOAD; 996 } else { 997 ctrl &= ~IGC_CTRL_EXT_DRV_LOAD; 998 } 999 1000 igc_write32(igc, IGC_CTRL_EXT, ctrl); 1001 } 1002 1003 /* 1004 * Basic device initialization and sanity check. This covers that we can 1005 * properly reset the device, validate its checksum, and get a valid MAC 1006 * address. 1007 */ 1008 static bool 1009 igc_hw_init(igc_t *igc) 1010 { 1011 int ret; 1012 uint32_t eecd; 1013 1014 if ((ret = igc_reset_hw(&igc->igc_hw)) != IGC_SUCCESS) { 1015 dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d", 1016 ret); 1017 return (false); 1018 } 1019 1020 /* 1021 * Goodbye firmware. 1022 */ 1023 igc_hw_control(igc, true); 1024 1025 /* 1026 * Check the NVM validity if a device is present.
1027 */ 1028 eecd = igc_read32(igc, IGC_EECD); 1029 if ((eecd & IGC_EECD_EE_DET) != 0) { 1030 if ((ret = igc_validate_nvm_checksum(&igc->igc_hw)) != 1031 IGC_SUCCESS) { 1032 dev_err(igc->igc_dip, CE_WARN, "failed to validate " 1033 "igc NVM checksum: %d", ret); 1034 return (false); 1035 } 1036 } 1037 1038 if ((ret = igc_read_mac_addr(&igc->igc_hw)) != IGC_SUCCESS) { 1039 dev_err(igc->igc_dip, CE_WARN, "failed to read MAC address: %d", 1040 ret); 1041 return (false); 1042 } 1043 1044 if ((ret = igc_get_phy_id(&igc->igc_hw)) != IGC_SUCCESS) { 1045 dev_err(igc->igc_dip, CE_WARN, "failed to get PHY id: %d", ret); 1046 return (false); 1047 } 1048 1049 return (true); 1050 } 1051 1052 /* 1053 * In case the user has modified the LED state through MAC_CAPAB_LED, restore 1054 * that back to the defaults we got when we started up the device. 1055 */ 1056 static void 1057 igc_led_fini(igc_t *igc) 1058 { 1059 igc_write32(igc, IGC_LEDCTL, igc->igc_ledctl); 1060 } 1061 1062 /* 1063 * Traditionally the Intel NIC drivers avoid touching activity pins as part of 1064 * their behavior for what we use. We also don't touch a pin if it's in SDP mode 1065 * and not being used to drive an LED as it means it's likely not for us. 1066 */ 1067 static bool 1068 igc_led_ignore(i225_led_mode_t mode) 1069 { 1070 switch (mode) { 1071 case I225_LED_M_FILTER_ACT: 1072 case I225_LED_M_LINK_ACT: 1073 case I225_LED_M_SDP: 1074 case I225_LED_M_PAUSE: 1075 case I225_LED_M_ACT: 1076 return (true); 1077 default: 1078 return (false); 1079 } 1080 } 1081 1082 static inline uint32_t 1083 igc_led_bitoff(uint32_t led) 1084 { 1085 VERIFY3U(led, <, 3); 1086 return (led * 8); 1087 } 1088 1089 static inline uint32_t 1090 igc_led_get_mode(uint32_t led, uint32_t reg) 1091 { 1092 uint32_t off = igc_led_bitoff(led); 1093 return (bitx32(reg, 3 + off, off)); 1094 } 1095 1096 static inline uint32_t 1097 igc_led_set_mode(uint32_t led, uint32_t reg, i225_led_mode_t mode) 1098 { 1099 uint32_t off = igc_led_bitoff(led); 1100 return (bitset32(reg, 3 + off, off, mode)); 1101 } 1102 1103 static inline uint32_t 1104 igc_led_get_ivrt(uint32_t led, uint32_t reg) 1105 { 1106 uint32_t off = igc_led_bitoff(led) + 6; 1107 return (bitx32(reg, off, off)); 1108 } 1109 1110 static inline uint32_t 1111 igc_led_set_blink(uint32_t led, uint32_t reg, bool en) 1112 { 1113 uint32_t off = igc_led_bitoff(led) + 7; 1114 return (bitset32(reg, off, off, en)); 1115 } 1116 1117 /* 1118 * There are three LEDs on the chip. The reference defines LED0 for 1 GbE link 1119 * up, LED1 for a 2.5GbE link up, and LED 2 for activity. However, this is all 1120 * controllable in the NVM so we shouldn't assume that these have any of their 1121 * default values. We instead read the LEDCTL register to see how it was set up 1122 * by default (though the NVM would likely be better). We then create pre-canned 1123 * LEDCTL register values for on, off, and default. See igc_osdep.h for some of 1124 * the caveats in definitions here. Note, we only tweak the non-activity LEDs 1125 * and if an LED has been indicated that it's being used for SDP, we don't touch 1126 * it. 
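 *
 * To make the helpers above concrete with a worked example (this is our
 * reading of the layout from those helpers and the I210-era docs, so
 * treat it as descriptive rather than authoritative): each LED owns one
 * byte of LEDCTL starting at the offset igc_led_bitoff() computes, with
 * the mode in the low four bits, the invert setting at bit 6, and the
 * blink enable at bit 7 of that byte. For LED1, igc_led_bitoff() returns
 * 8, so igc_led_get_mode() reads LEDCTL[11:8], igc_led_get_ivrt() reads
 * bit 14, and igc_led_set_blink() sets bit 15.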
1127 */ 1128 static void 1129 igc_led_init(igc_t *igc) 1130 { 1131 uint32_t led = igc_read32(igc, IGC_LEDCTL); 1132 1133 igc->igc_ledctl = led; 1134 igc->igc_ledctl_on = led; 1135 igc->igc_ledctl_off = led; 1136 igc->igc_ledctl_blink = led; 1137 1138 for (uint32_t i = 0; i < IGC_I225_NLEDS; i++) { 1139 i225_led_mode_t mode = igc_led_get_mode(i, led); 1140 if (!igc_led_ignore(mode)) { 1141 /* 1142 * If the inversion logic is on, that changes what the 1143 * on and off modes mean, so we need to change how we 1144 * set that appropriately. 1145 */ 1146 if (igc_led_get_ivrt(i, led) != 0) { 1147 igc->igc_ledctl_on = igc_led_set_mode(i, 1148 igc->igc_ledctl_on, I225_LED_M_OFF); 1149 igc->igc_ledctl_off = igc_led_set_mode(i, 1150 igc->igc_ledctl_off, I225_LED_M_ON); 1151 igc->igc_ledctl_blink = igc_led_set_mode(i, 1152 igc->igc_ledctl_blink, I225_LED_M_OFF); 1153 } else { 1154 igc->igc_ledctl_on = igc_led_set_mode(i, 1155 igc->igc_ledctl_on, I225_LED_M_ON); 1156 igc->igc_ledctl_off = igc_led_set_mode(i, 1157 igc->igc_ledctl_off, I225_LED_M_OFF); 1158 igc->igc_ledctl_blink = igc_led_set_mode(i, 1159 igc->igc_ledctl_blink, I225_LED_M_ON); 1160 } 1161 } 1162 1163 igc->igc_ledctl_blink = igc_led_set_blink(i, 1164 igc->igc_ledctl_blink, true); 1165 } 1166 1167 igc->igc_led_mode = MAC_LED_DEFAULT; 1168 } 1169 1170 static void 1171 igc_write_ivar(igc_t *igc, uint32_t queue, bool rx, uint32_t msix) 1172 { 1173 const uint32_t ivarno = queue >> 1; 1174 const uint32_t reg = IGC_IVAR0 + ivarno * 4; 1175 const uint32_t val = msix | IGC_IVAR_VALID; 1176 uint32_t bitoff, bitend, ivar; 1177 1178 if (rx) { 1179 if ((queue % 2) == 0) { 1180 bitoff = IGC_IVAR_RX0_START; 1181 } else { 1182 bitoff = IGC_IVAR_RX1_START; 1183 } 1184 } else { 1185 if ((queue % 2) == 0) { 1186 bitoff = IGC_IVAR_TX0_START; 1187 } else { 1188 bitoff = IGC_IVAR_TX1_START; 1189 } 1190 } 1191 bitend = bitoff + IGC_IVAR_ENT_LEN - 1; 1192 1193 ivar = igc_read32(igc, reg); 1194 ivar = bitset32(ivar, bitend, bitoff, val); 1195 igc_write32(igc, reg, ivar); 1196 igc->igc_eims |= 1 << msix; 1197 } 1198 1199 /* 1200 * Here we need to go through and initialize the hardware's notion of how 1201 * interrupts are mapped to causes. The device must be specifically enabled for 1202 * MSI-X and then this is also where we go ensure that all of our interrupt 1203 * coalescing is properly enabled. Note, we must first touch the GPIE register 1204 * to enable MSI-X settings, otherwise later settings won't do anything. 1205 */ 1206 static void 1207 igc_hw_intr_init(igc_t *igc) 1208 { 1209 uint32_t gpie, ivar; 1210 1211 gpie = IGC_GPIE_NSICR | IGC_GPIE_MSIX_MODE | IGC_GPIE_EIAME | 1212 IGC_GPIE_PBA; 1213 igc_write32(igc, IGC_GPIE, gpie); 1214 1215 /* 1216 * Other causes are always explicitly mapped to cause 0. Each ring then 1217 * has its own mapping. In the MISC IVAR, these start at bit 8. We leave 1218 * the '0 |' out below just to avoid a compiler complaining. We also 1219 * must unmask this interrupt cause, which is in bit 0. 1220 */ 1221 ivar = IGC_IVAR_VALID << 8; 1222 igc_write32(igc, IGC_IVAR_MISC, ivar); 1223 igc->igc_eims = 1; 1224 1225 /* 1226 * There are a few IVAR registers available in hardware. Each IVAR 1227 * register handles mapping a given queue to an MSI-X. Each IVAR handles 1228 * two queues.
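 *
 * As a worked example of what the loops below do with today's fixed
 * assignment (tx ring 0 on MSI-X vector 0, rx ring 0 on MSI-X vector 1),
 * both entries end up in IVAR0: igc_write_ivar() programs the TX0 field
 * with (0 | IGC_IVAR_VALID) and the RX0 field with (1 | IGC_IVAR_VALID).
 * Along the way igc_eims accumulates (1 << 0) and (1 << 1), which,
 * together with bit 0 set above for the miscellaneous cause, leaves
 * igc_eims at 0x3 for the later EIAC/EIMS writes in igc_hw_intr_enable().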
1229 */ 1230 for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { 1231 igc_write_ivar(igc, i, false, 1232 igc->igc_tx_rings[i].itr_intr_idx); 1233 } 1234 1235 for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) { 1236 igc_write_ivar(igc, i, true, igc->igc_rx_rings[i].irr_intr_idx); 1237 } 1238 1239 for (uint32_t i = 0; i < igc->igc_nintrs; i++) { 1240 igc_write32(igc, IGC_EITR(i), igc->igc_eitr); 1241 } 1242 } 1243 1244 /* 1245 * Synchronize our sense of the unicast table over to the device. If this is the 1246 * first time that we're here due to attach, we need to go through and allocate 1247 * the tracking table. 1248 */ 1249 static void 1250 igc_unicast_sync(igc_t *igc) 1251 { 1252 ASSERT(MUTEX_HELD(&igc->igc_lock)); 1253 1254 if (igc->igc_ucast == NULL) { 1255 igc->igc_nucast = igc->igc_hw.mac.rar_entry_count; 1256 igc->igc_ucast = kmem_zalloc(sizeof (igc_addr_t) * 1257 igc->igc_nucast, KM_SLEEP); 1258 } 1259 1260 for (uint16_t i = 0; i < igc->igc_nucast; i++) { 1261 int ret = igc_rar_set(&igc->igc_hw, igc->igc_ucast[i].ia_mac, 1262 i); 1263 /* 1264 * Common code today guarantees this can't fail. Put this here 1265 * to ensure to guard against future updates. 1266 */ 1267 VERIFY3S(ret, ==, IGC_SUCCESS); 1268 } 1269 1270 } 1271 1272 /* 1273 * The core code interface to the multicast table requires us to give them a 1274 * packed uint8_t array that they manually walk through in ETHERADDRL (6 byte) 1275 * chunks. This must be packed. To deal with this we opt to preserve a normal 1276 * list of multicast addresses and then a secondary version that's serialized as 1277 * the core code wants it. We allocate the memory for this secondary version at 1278 * the start. 1279 */ 1280 void 1281 igc_multicast_sync(igc_t *igc) 1282 { 1283 uint16_t nvalid; 1284 1285 ASSERT(MUTEX_HELD(&igc->igc_lock)); 1286 1287 if (igc->igc_mcast == NULL) { 1288 igc->igc_nmcast = igc->igc_hw.mac.mta_reg_count; 1289 igc->igc_mcast = kmem_zalloc(sizeof (igc_addr_t) * 1290 igc->igc_nmcast, KM_SLEEP); 1291 igc->igc_mcast_raw = kmem_alloc(sizeof (ether_addr_t) * 1292 igc->igc_nmcast, KM_SLEEP); 1293 } 1294 1295 bzero(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast); 1296 nvalid = 0; 1297 for (uint16_t i = 0; i < igc->igc_nmcast; i++) { 1298 ether_addr_t *targ = &igc->igc_mcast_raw[nvalid]; 1299 1300 if (!igc->igc_mcast[i].ia_valid) 1301 continue; 1302 bcopy(igc->igc_mcast[i].ia_mac, targ, sizeof (ether_addr_t)); 1303 nvalid++; 1304 } 1305 1306 igc_update_mc_addr_list(&igc->igc_hw, (uint8_t *)igc->igc_mcast_raw, 1307 nvalid); 1308 } 1309 1310 /* 1311 * This function is used to reinitialize the PBA, our various flow control 1312 * settings, reset hardware, ensure that the EEE, DPLU, and related power modes 1313 * are in the correct state. 1314 */ 1315 bool 1316 igc_hw_common_init(igc_t *igc) 1317 { 1318 int ret; 1319 uint32_t pba, hwm, hwmp, hwm2x; 1320 struct igc_hw *hw = &igc->igc_hw; 1321 1322 /* 1323 * The PBA register determines which portion is used for the receive 1324 * buffers and which is used for the transmit buffers. This follows from 1325 * the I210 and reference drivers which use 34K as the default. We 1326 * currently leave the RXPBS and TXPBS at their power-on-reset defaults. 1327 * 1328 * We set the watermark based settings similar to igb, ensuring that we 1329 * have 16-byte granularity. The general guidelines from there was that 1330 * when it comes to automatic Ethernet PAUSE frame generation we should: 1331 * 1332 * - After an XOFF, you want to receive at least two frames. 
We use 1333 * whichever is smaller of 9/10ths and two frames. 1334 * - The low water mark apparently wants to be closer to the high water 1335 * mark. 1336 * 1337 * See igb_init_adapter() for more information. We basically use the 1338 * same calculation it did, given that the MAC is basically the same. 1339 */ 1340 pba = IGC_PBA_34K; 1341 hwmp = (pba << 10) * 9 / 10; 1342 hwm2x = (pba << 10) - 2 * igc->igc_max_frame; 1343 hwm = MIN(hwmp, hwm2x); 1344 1345 hw->fc.high_water = hwm & 0xfffffff0; 1346 hw->fc.low_water = igc->igc_hw.fc.high_water - 16; 1347 1348 /* 1349 * Use the suggested default pause time. 1350 */ 1351 hw->fc.pause_time = IGC_FC_PAUSE_TIME; 1352 hw->fc.send_xon = true; 1353 1354 if ((ret = igc_reset_hw(hw)) != IGC_SUCCESS) { 1355 dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d", 1356 ret); 1357 return (false); 1358 } 1359 1360 if ((ret = igc_init_hw(hw)) != IGC_SUCCESS) { 1361 dev_err(igc->igc_dip, CE_WARN, "failed to init hardware: %d", 1362 ret); 1363 return (false); 1364 } 1365 1366 /* 1367 * Clear wake on LAN and set other power states. In addition, disable 1368 * EEE for now. 1369 */ 1370 igc_write32(igc, IGC_WUC, 0); 1371 1372 if ((ret = igc_set_d0_lplu_state(hw, false)) != IGC_SUCCESS) { 1373 dev_err(igc->igc_dip, CE_WARN, "failed to set D0 LPLU mode: %d", 1374 ret); 1375 return (false); 1376 } 1377 1378 /* 1379 * There have been reports that enabling EEE for some 2.5G devices has 1380 * led to issues with the I225/226. It's not entirely clear, but we 1381 * default to disabling this like in igb/e1000g for now. 1382 */ 1383 if ((ret = igc_set_eee_i225(hw, false, false, false)) != IGC_SUCCESS) { 1384 dev_err(igc->igc_dip, CE_WARN, "failed to set EEE mode: %d", 1385 ret); 1386 return (false); 1387 } 1388 1389 igc_hw_intr_init(igc); 1390 1391 mutex_enter(&igc->igc_lock); 1392 igc_unicast_sync(igc); 1393 igc_multicast_sync(igc); 1394 1395 igc->igc_hw.mac.get_link_status = true; 1396 (void) igc_get_phy_info(hw); 1397 (void) igc_check_for_link(hw); 1398 mutex_exit(&igc->igc_lock); 1399 1400 return (true); 1401 } 1402 1403 static bool 1404 igc_intr_en(igc_t *igc) 1405 { 1406 int ret; 1407 1408 if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) { 1409 ret = ddi_intr_block_enable(igc->igc_intr_handles, 1410 igc->igc_nintrs); 1411 if (ret != DDI_SUCCESS) { 1412 dev_err(igc->igc_dip, CE_WARN, "failed to block " 1413 "enable interrupts: %d", ret); 1414 return (false); 1415 } 1416 } else { 1417 for (int i = 0; i < igc->igc_nintrs; i++) { 1418 ret = ddi_intr_enable(igc->igc_intr_handles[i]); 1419 if (ret != DDI_SUCCESS) { 1420 dev_err(igc->igc_dip, CE_WARN, "failed to " 1421 "enable interrupt %d: %d", i, ret); 1422 for (int clean = 0; clean < i; clean++) { 1423 ret = ddi_intr_disable( 1424 igc->igc_intr_handles[clean]); 1425 if (ret != DDI_SUCCESS) { 1426 dev_err(igc->igc_dip, CE_WARN, 1427 "failed to disable " 1428 "interrupt %d while " 1429 "unwinding: %d", i, ret); 1430 } 1431 } 1432 return (false); 1433 } 1434 } 1435 } 1436 1437 /* 1438 * Now that we've enabled interrupts here, clear any pending interrupts 1439 * and make sure hardware interrupts are enabled. 1440 */ 1441 (void) igc_read32(igc, IGC_ICR); 1442 1443 return (true); 1444 } 1445 1446 /* 1447 * Undo interrupt enablement. 
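 * As a descriptive note on the three register writes below: writing all
 * 1s to EIMC masks every extended (per-vector) cause, clearing EIAC stops
 * those causes from being auto-cleared, and writing all 1s to IMC masks
 * the legacy causes.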
1448 */ 1449 void 1450 igc_hw_intr_disable(igc_t *igc) 1451 { 1452 igc_write32(igc, IGC_EIMC, UINT32_MAX); 1453 igc_write32(igc, IGC_EIAC, 0); 1454 igc_write32(igc, IGC_IMC, UINT32_MAX); 1455 } 1456 1457 /* 1458 * This is used during the GLDv3 mc_start(9E) entry point to enable interrupts 1459 * on the device itself. 1460 */ 1461 void 1462 igc_hw_intr_enable(igc_t *igc) 1463 { 1464 uint32_t ims; 1465 1466 /* 1467 * First we clear pending interrupts. 1468 */ 1469 (void) igc_read32(igc, IGC_ICR); 1470 1471 /* 1472 * The hardware has extended and non-extended interrupt masks and 1473 * auto-clear registers. We always disable auto-clear for the 1474 * non-extended portions. See the I210 datasheet 'Setting Interrupt 1475 * Registers' for a better sense of what's going on here. 1476 * 1477 * In the IMS register we always register link status change events and 1478 * device reset assertions. 1479 */ 1480 ims = IGC_IMS_LSC | IGC_IMS_DRSTA; 1481 1482 igc_write32(igc, IGC_EIAC, igc->igc_eims); 1483 igc_write32(igc, IGC_EIMS, igc->igc_eims); 1484 igc_write32(igc, IGC_IMS, ims); 1485 igc_write32(igc, IGC_IAM, 0); 1486 } 1487 1488 static void 1489 igc_cleanup(igc_t *igc) 1490 { 1491 if (igc->igc_mcast != NULL) { 1492 ASSERT3U(igc->igc_nmcast, !=, 0); 1493 kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) * 1494 igc->igc_nmcast); 1495 kmem_free(igc->igc_mcast, sizeof (igc_addr_t) * 1496 igc->igc_nmcast); 1497 igc->igc_nmcast = 0; 1498 igc->igc_mcast = NULL; 1499 } 1500 1501 if (igc->igc_ucast != NULL) { 1502 ASSERT3U(igc->igc_nucast, !=, 0); 1503 kmem_free(igc->igc_ucast, sizeof (igc_addr_t) * 1504 igc->igc_nucast); 1505 igc->igc_nucast = 0; 1506 igc->igc_ucast = NULL; 1507 } 1508 1509 if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) { 1510 int ret; 1511 if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) { 1512 ret = ddi_intr_block_disable(igc->igc_intr_handles, 1513 igc->igc_nintrs); 1514 if (ret != DDI_SUCCESS) { 1515 dev_err(igc->igc_dip, CE_WARN, "failed to " 1516 "block disable interrupts: %d", ret); 1517 } 1518 } else { 1519 for (int i = 0; i < igc->igc_nintrs; i++) { 1520 ret = ddi_intr_disable( 1521 igc->igc_intr_handles[i]); 1522 if (ret != DDI_SUCCESS) { 1523 dev_err(igc->igc_dip, CE_WARN, "failed " 1524 "to disable interrupt %d: %d", i, 1525 ret); 1526 } 1527 } 1528 } 1529 igc->igc_attach &= ~IGC_ATTACH_INTR_EN; 1530 } 1531 1532 if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) { 1533 int ret = mac_unregister(igc->igc_mac_hdl); 1534 if (ret != 0) { 1535 dev_err(igc->igc_dip, CE_WARN, "failed to unregister " 1536 "MAC handle: %d", ret); 1537 } 1538 igc->igc_attach &= ~IGC_ATTACH_MAC; 1539 } 1540 1541 if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) { 1542 igc_stats_fini(igc); 1543 igc->igc_attach &= ~IGC_ATTACH_STATS; 1544 } 1545 1546 if ((igc->igc_attach & IGC_ATTACH_LED) != 0) { 1547 igc_led_fini(igc); 1548 igc->igc_attach &= ~IGC_ATTACH_LED; 1549 } 1550 1551 if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) { 1552 for (int i = 0; i < igc->igc_nintrs; i++) { 1553 int ret = 1554 ddi_intr_remove_handler(igc->igc_intr_handles[i]); 1555 if (ret != 0) { 1556 dev_err(igc->igc_dip, CE_WARN, "failed to " 1557 "remove interrupt %d handler: %d", i, ret); 1558 } 1559 } 1560 igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER; 1561 } 1562 1563 if (igc->igc_tx_rings != NULL) { 1564 for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) { 1565 igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]); 1566 mutex_destroy(&igc->igc_tx_rings[i].itr_lock); 1567 } 1568 kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) * 1569 
static void
igc_cleanup(igc_t *igc)
{
	if (igc->igc_mcast != NULL) {
		ASSERT3U(igc->igc_nmcast, !=, 0);
		kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) *
		    igc->igc_nmcast);
		kmem_free(igc->igc_mcast, sizeof (igc_addr_t) *
		    igc->igc_nmcast);
		igc->igc_nmcast = 0;
		igc->igc_mcast = NULL;
	}

	if (igc->igc_ucast != NULL) {
		ASSERT3U(igc->igc_nucast, !=, 0);
		kmem_free(igc->igc_ucast, sizeof (igc_addr_t) *
		    igc->igc_nucast);
		igc->igc_nucast = 0;
		igc->igc_ucast = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) {
		int ret;
		if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
			ret = ddi_intr_block_disable(igc->igc_intr_handles,
			    igc->igc_nintrs);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "block disable interrupts: %d", ret);
			}
		} else {
			for (int i = 0; i < igc->igc_nintrs; i++) {
				ret = ddi_intr_disable(
				    igc->igc_intr_handles[i]);
				if (ret != DDI_SUCCESS) {
					dev_err(igc->igc_dip, CE_WARN, "failed "
					    "to disable interrupt %d: %d", i,
					    ret);
				}
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_EN;
	}

	if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) {
		int ret = mac_unregister(igc->igc_mac_hdl);
		if (ret != 0) {
			dev_err(igc->igc_dip, CE_WARN, "failed to unregister "
			    "MAC handle: %d", ret);
		}
		igc->igc_attach &= ~IGC_ATTACH_MAC;
	}

	if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) {
		igc_stats_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_STATS;
	}

	if ((igc->igc_attach & IGC_ATTACH_LED) != 0) {
		igc_led_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_LED;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret =
			    ddi_intr_remove_handler(igc->igc_intr_handles[i]);
			if (ret != 0) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "remove interrupt %d handler: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER;
	}

	if (igc->igc_tx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
			igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]);
			mutex_destroy(&igc->igc_tx_rings[i].itr_lock);
		}
		kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) *
		    igc->igc_ntx_rings);
		igc->igc_tx_rings = NULL;
	}

	if (igc->igc_rx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
			igc_rx_ring_stats_fini(&igc->igc_rx_rings[i]);
			cv_destroy(&igc->igc_rx_rings[i].irr_free_cv);
			mutex_destroy(&igc->igc_rx_rings[i].irr_free_lock);
			mutex_destroy(&igc->igc_rx_rings[i].irr_lock);
		}
		kmem_free(igc->igc_rx_rings, sizeof (igc_rx_ring_t) *
		    igc->igc_nrx_rings);
		igc->igc_rx_rings = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_MUTEX) != 0) {
		mutex_destroy(&igc->igc_lock);
		igc->igc_attach &= ~IGC_ATTACH_MUTEX;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_ALLOC) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret = ddi_intr_free(igc->igc_intr_handles[i]);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "unexpected "
				    "failure freeing interrupt %d: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_ALLOC;
	}

	if (igc->igc_intr_handles != NULL) {
		ASSERT3U(igc->igc_intr_size, !=, 0);
		kmem_free(igc->igc_intr_handles, igc->igc_intr_size);
	}

	/*
	 * Now that we're almost done, begrudgingly let firmware know we're
	 * done.
	 */
	igc_hw_control(igc, false);

	if (igc->igc_regs_hdl != NULL) {
		ddi_regs_map_free(&igc->igc_regs_hdl);
		igc->igc_regs_base = NULL;
	}

	if (igc->igc_cfgspace != NULL) {
		pci_config_teardown(&igc->igc_cfgspace);
	}
	igc->igc_attach &= ~IGC_ATTACH_REGS;

	ddi_set_driver_private(igc->igc_dip, NULL);
	igc->igc_dip = NULL;

	VERIFY0(igc->igc_attach);

	kmem_free(igc, sizeof (igc_t));
}

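/*
 * A note on how igc_cleanup() above and igc_attach() below fit together: each
 * attach step that succeeds either sets an IGC_ATTACH_* flag or leaves behind
 * a non-NULL pointer, and cleanup only tears down what those markers say was
 * actually set up. That is what allows the attach error path and igc_detach()
 * to share this single cleanup routine, with the final VERIFY0() catching any
 * state that was set up but never accounted for.
 */
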
static int
igc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	igc = kmem_zalloc(sizeof (igc_t), KM_SLEEP);
	ddi_set_driver_private(dip, igc);
	igc->igc_dip = dip;

	/*
	 * Initialize the few members whose initial values are not zero.
	 */
	igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
	igc->igc_link_state = LINK_STATE_UNKNOWN;

	/*
	 * Set up all the register spaces that hardware requires.
	 */
	if (!igc_setup_regs(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_REGS;

	/*
	 * Set up the common code.
	 */
	if (!igc_core_code_init(igc)) {
		goto err;
	}

	if (!igc_limits_init(igc)) {
		goto err;
	}

	/*
	 * Go allocate and set up all of our interrupts.
	 */
	if (!igc_intr_init(igc)) {
		goto err;
	}

	/*
	 * Initialize our main mutex for the device now that we have an
	 * interrupt priority.
	 */
	mutex_init(&igc->igc_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(igc->igc_intr_pri));
	igc->igc_attach |= IGC_ATTACH_MUTEX;

	/*
	 * We now want to determine the total number of rx and tx rings that we
	 * have based on our interrupt allocation so we can go through and
	 * perform the rest of the device setup that is required. The various
	 * queues that we have are mapped to a given MSI-X vector through the
	 * IVAR registers in the device. There is also an IVAR_MISC register
	 * that maps link state change events and other miscellaneous causes to
	 * up to two vectors.
	 *
	 * There isn't strictly per-queue interrupt generation control. Instead,
	 * when in MSI-X mode, the device has an extended interrupt cause and
	 * mask register. The mask register allows us to mask the five bits
	 * described above.
	 *
	 * Because of all this we end up limiting the number of queues that we
	 * use to 2 for now: 1 for tx and 1 for rx. Interrupt 0 is for tx/other
	 * and 1 for rx.
	 */
	igc->igc_nrx_rings = 1;
	igc->igc_ntx_rings = 1;

	/*
	 * Default to a 1500 byte MTU.
	 */
	igc->igc_mtu = ETHERMTU;
	igc_hw_buf_update(igc);

	/*
	 * Initialize default descriptor limits and thresholds. We allocate 1.5
	 * times the number of rx descriptors so that we can loan up to 1/3rd
	 * of them. We allocate an even number of tx descriptors.
	 */
	igc->igc_rx_ndesc = IGC_DEF_RX_RING_SIZE;
	igc->igc_tx_ndesc = IGC_DEF_TX_RING_SIZE;
	igc->igc_rx_nbuf = igc->igc_rx_ndesc + (igc->igc_rx_ndesc >> 1);
	igc->igc_tx_nbuf = igc->igc_tx_ndesc;
	igc->igc_rx_nfree = igc->igc_rx_nbuf - igc->igc_rx_ndesc;
	igc->igc_rx_intr_nframes = IGC_DEF_RX_RING_INTR_LIMIT;
	igc->igc_rx_bind_thresh = IGC_DEF_RX_BIND;
	igc->igc_tx_bind_thresh = IGC_DEF_TX_BIND;
	igc->igc_tx_notify_thresh = IGC_DEF_TX_NOTIFY_MIN;
	igc->igc_tx_recycle_thresh = IGC_DEF_TX_RECYCLE_MIN;
	igc->igc_tx_gap = IGC_DEF_TX_GAP;
	igc->igc_eitr = IGC_DEF_EITR;

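	/*
	 * To make the rx buffer accounting above concrete: if
	 * IGC_DEF_RX_RING_SIZE were, say, 512 descriptors (purely for
	 * illustration; the real value is defined in igc.h), then igc_rx_nbuf
	 * would be 512 + 256 = 768 buffers and igc_rx_nfree would start at
	 * 768 - 512 = 256. That is, the descriptor ring stays fully populated
	 * while up to a third of all rx buffers (the extra half ring's worth)
	 * can be out on loan to the stack at any time.
	 */
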
	if (!igc_rings_alloc(igc)) {
		goto err;
	}

	if (!igc_intr_hdlr_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_HANDLER;

	/*
	 * Next reset the device before we begin initializing anything else. As
	 * part of this, validate the flash checksum if present. This is all
	 * initialization that we only do once per device. Initialization that
	 * must be redone after any reset lives in igc_hw_common_init().
	 */
	if (!igc_hw_init(igc)) {
		goto err;
	}

	igc_led_init(igc);
	igc->igc_attach |= IGC_ATTACH_LED;

	/*
	 * Snapshot our basic settings that users can eventually control in the
	 * device. We start with always enabling auto-negotiation and
	 * advertising the basic supported speeds. The I225v1 does have
	 * substantial problems with enabling 2.5G because it doesn't maintain
	 * a proper inter-packet gap. Despite that, we default to enabling 2.5G
	 * for now as it's supposedly not broken with all link partners and the
	 * NVM. We also don't have a way of actually identifying and mapping
	 * that to something in the driver today, unfortunately.
	 */
	igc->igc_hw.mac.autoneg = true;
	igc->igc_hw.phy.autoneg_wait_to_complete = false;
	igc->igc_hw.phy.autoneg_advertised = IGC_DEFAULT_ADV;
	igc->igc_hw.fc.requested_mode = igc_fc_default;
	igc->igc_hw.fc.current_mode = igc_fc_default;

	if (!igc_hw_common_init(igc)) {
		goto err;
	}

	if (!igc_stats_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_STATS;

	/*
	 * Register with MAC
	 */
	if (!igc_mac_register(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_MAC;

	/*
	 * Enable interrupts and get going.
	 */
	if (!igc_intr_en(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_EN;

	return (DDI_SUCCESS);

err:
	igc_cleanup(igc);
	return (DDI_FAILURE);
}

static int
igc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	igc = ddi_get_driver_private(dip);
	if (igc == NULL) {
		dev_err(dip, CE_WARN, "asked to detach, but missing igc_t");
		return (DDI_FAILURE);
	}

	igc_cleanup(igc);
	return (DDI_SUCCESS);
}

static struct cb_ops igc_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops igc_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = igc_attach,
	.devo_detach = igc_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &igc_cb_ops
};

static struct modldrv igc_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel I225/226 Ethernet Controller",
	.drv_dev_ops = &igc_dev_ops
};

static struct modlinkage igc_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &igc_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	mac_init_ops(&igc_dev_ops, IGC_MOD_NAME);

	if ((ret = mod_install(&igc_modlinkage)) != 0) {
		mac_fini_ops(&igc_dev_ops);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&igc_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&igc_modlinkage)) == 0) {
		mac_fini_ops(&igc_dev_ops);
	}

	return (ret);
}