xref: /illumos-gate/usr/src/uts/common/io/igc/igc.c (revision 9164a50bf932130cbb5097a16f6986873ce0e6e5)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2024 Oxide Computer Company
14  */
15 
16 /*
17  * Intel I225/226 Ethernet Driver
18  * ------------------------------
19  *
20  * This driver implements support for the Intel I225 and I226 Ethernet
21  * controllers which support up to 2.5 GbE and generally only supports BASE-T
22  * copper phys. This device is yet another variant on the venerable Intel 1 GbE
23  * devices that are found in e1000g(4D) and igb(4D). This is its own driver in
24  * part because that's how Intel did things and refactored their common code
25  * which we import and is found in the 'core' directory.
26  *
27  * There is not a good datasheet for the MAC that we've been able to find for
28  * this part. It's not clear that Intel even has a doc for this in their
29  * Resource and Design Center. The primary datasheet documents the NVM and other
30  * parts of it, but not the software interface. Based on observations from the
31  * common code we describe this as somewhat of an evolution of the I217 and
32  * I210, with less features than the I210, which comes from the server world
33  * (which ws itself a more stripped down I350).
34  *
35  * The result of all this is us trying to focus on what we know about this part
36  * and making some assumptions along the way. This includes things like:
37  *
38  * 1) We believe that the device only supports up to 4 RX and TX queues.
39  * 2) There is only one TX context for each TX queue and it is mapped to the
40  * queue.
41  * 3) There is no support for the head writeback modes that we've found.
42  * 4) This does otherwise support both the MSI-X and MSI/INTx interrupt
43  * management which are shaped very differently in the device.
44  * 5) The 2500BASE-T PHY support is unique, but the other PHY settings are
45  * roughly the same as far as we can tell.
46  *
47  * There are certainly more differences than the points up above, but the above
48  * are ones that generally influence our design.
49  *
50  * ------------
51  * Organization
52  * ------------
53  *
54  * This driver is first broken into two different pieces. There is the 'core'
55  * code which we import from Intel via FreeBSD. All of these sources are in the
56  * 'uts/common/io/igc/core' directory and we try our hardest to avoid modifying
57  * them (hence the smatch gags). The core code can be thought of as abstracting
58  * the MAC, NVM, and PHY across different chipsets (right now it's all the I225)
59  * and providing us with a series of library calls that we can do to manage the
60  * chip.
61  *
62  * The remaining files that sit alongside this one implement different portions
63  * of functionality related to the device. In particular:
64  *
65  *  igc.[ch]:		This is the main entry point for the driver and the
66  *			source of this block comment. It implements all of the
67  *			basic DDI entry points: attach and detach, interrupts,
68  *			PCI config space and register set up and tear down.
69  *
70  *			The header file contains all of the structure
71  *			definitions that we use throughout this and the basic
72  *			constants we use for sizing.
73  *
74  *  igc_gld.c		This file implements all of the major GLDv3 required
75  *			entry points that are found in mac(9E). The guts of the
76  *			actual I/O are in igc_ring.c, but getting and setting
77  *			all of the various MAC properties and other bits is
78  *			here.
79  *
80  *  igc_osdep.[ch]	The osdep (OS dependent) files, are used to define and
81  *			implement the functionality required by the common code.
82  *			igc_osdep.h is included in the build of each file.
83  *
84  *			We have a second use for igc_osdep.h which is where we
85  *			put missing hardware definitions that apply. This is for
86  *			cases where the core code doesn't have it and it should
87  *			really live in igc_defines.h or igc_regs.h, but we keep
88  *			it here to avoid modifying those.
89  *
90  *  igc_ring.c		This implements the core I/O routines of the device
91  *			driver, starting with the descriptor ring setup and tear
92  *			down as well as DMA, descriptor ring, and per-frame
93  *			memory. It also implements all of the primary logic for
94  *			transmitting and receiving frames.
95  *
96  *  igc_stat.c		This file deals with kstat creation and destruction as
97  *			well as reading and fetching all of the registers that
98  *			exist in hardware.
99  *
100  * There are a few primary data structures to be aware of. Their relationships
101  * are shown in the following image and then described. Note, each structure has
102  * many more fields than those pictured:
103  *
104  * +---------------+
105  * | dev_info_t *  |
106  * |              -|-+
107  * | private data  | |
108  * +---------------+ v
109  *   +------------------------------+        +---------------------+
110  *   | igc_t                        |        | igc_addr_t          |
111  *   | per-instance primary         |  +---->|                     |
112  *   | structure                    |  |+--->| Notes a MAC address | ...
113  *   |                              |  ||    | stored in hardware  |
114  *   | igc_addr_t    *igc_ucast    -|--+|    +---------------------+
115  *   | igc_addr_t    *igc_mcast    -|---+      +---------------------------+
116  *   | struct igc_hw *igc_hw       -|--------->| struct igc_hw (core code) |
117  *   | igc_tx_ring_t *igc_tx_rings -|--+       |                           |
118  *   | igc_rx_ring_t *igc_rx_rings -|--|---+   | igc_mac_info mac          |
119  *   +------------------------------+  |   |   | igc_fc_info  fc           |
120  *                                     |   |   | igc_phy_info phy          |
121  *  +----------------------------------+   |   | igc_nvm_info nvm          |
122  *  |                                      v   +---------------------------+
123  *  |  +--------------------------------------+
124  *  |  | igc_rx_ring_t                        |
125  *  |  |                                      |
126  *  |  | igc_adv_rx_desc *irr_ring         ---|--> rx hw descriptor ring
127  *  |  | uint32_t        irr_next          ---|--> next entry to look for data
128  *  |  | igc_rx_buffer_t **irr_work_list   ---|--> corresponds to ring entries
129  *  |  | uint32_t        irr_nfree         ---|--> number of free list entries
130  *  |  | igc_rx_buffer_t **irr_free_list   ---|--> set of buffers free for bind
131  *  |  | igc_rx_buffer_t *irr_arena        ---|-+> array of all rx buffers
132  *  |  +--------------------------------------+ |
133  *  |                                           |
134  *  |          +----------------------------+   |
135  *  |          | igc_rx_buffer_t            |<--+
136  *  |          |                            |
137  *  |          | mblk_t            *igb_mp -|---> mblk_t for rx buffer
138  *  |          | igc_dma_buffer_t  irb_dma -|---> DMA memory for rx buffer
139  *  |          +----------------------------+
140  *  |
141  *  |   +------------------------------------+
142  *  +-->| igc_tx_ring_t                      |
143  *      |                                    |
144  *      | icc_adv_tx_desc   *itr_ring      --|--> tx hw descriptor ring
145  *      | uin32_t           itr_ring_head  --|--> next descriptor to recycle
146  *      | uin32_t           itr_ring_fail  --|--> next descriptor to place
147  *      | uin32_t           itr_ring_free  --|--> free descriptors in ring
148  *      | igc_tx_buffer_t   **itr_work_list  |--> corresponds to ring entries
149  *      | list_t            itr_free_list  --|--> available tx buffers
150  *      | igc_tx_buffer_t   *itr_arena     --|-+> array of all tx buffers
151  *      +------------------------------------+ |
152  *                                             |
153  *        +---------------------------------+  |
154  *        | igc_tx_buffer_t                 |<-+
155  *        |                                 |
156  *        | mblk_t           *itb_mp      --|--> mblk to tx (only in first)
157  *        | igc_dma_buffer_t itb_dma      --|--> tx DMA buffer for copy
158  *        | ddi_dma_handle_t itb_bind_hdl --|--> DMA handle for bind
159  *        +---------------------------------+
160  *
161  * igc_t		This is the primary data structure that exists for each
162  *			instance of the driver. There is generally a 1:1
163  *			relationship between a physical port, an instance of the
164  *			driver, and a PCI function. This structure provides
165  *			access to the device's registers and it embeds the
166  *			common code's struct igc_hw.
167  *
168  * struct igc_hw	This structure is used by the core code and it contains
169  *			information related to the MAC, PHY, NVM, and related
170  *			information that the device uses. In general, this
171  *			structure is used when performing API calls to the
172  *			common code. The common code calls back into us in the
173  *			igc_osdep.c interfaces.
174  *
175  * igc_tx_ring_t	This structure represents a single transmit ring in
176  *			hardware, its associated software state, and
177  *			miscellaneous data like statistics, MAC handles, etc.
178  *			See the 'TX Data Path Design' section for more
179  *			information.
180  *
181  * igc_rx_ring_t	This is the receive variant of a ring. It represents and
182  *			tracks the hardware state along with all our metadata.
183  *			One of these exists for each receive ring that we've
184  *			enabled (currently one). See the 'RX Data Path Design'
185  *			section for more information.
186  *
187  * igc_tx_buffer_t	This represents a single tx buffer in the driver. A tx
188  *			buffer contains DMA based storage that it can use to
189  *			transmit a packet and contains a second DMA handle that
190  *			can be used to bind a specific mblk_t to it. tx buffers
191  *			are capped at the current page size and can be smaller
192  *			if the maximum packet size is smaller. A 1500 byte MTU
193  *			will end up with a 2 KiB buffer due to the device's
194  *			internal alignment requirements.
195  *
196  * igc_rx_buffer_t	This represents a single rx buffer in the driver. These
197  *			buffers may be loaned up to MAC and then returned to us
198  *			later. They contain a single DMA buffer which right now
199  *			is a single contiguous buffer that fits the maximum
200  *			packet size. Each buffer has a corresponding mblk_t that
201  *			it is mapped to.
202  *
203  * igc_dma_buffer_t	This represent a DMA buffer in the system. DMA buffers
204  *			are used for transmit buffers, receive buffers, or
205  *			various ring descriptor entries. The DMA buffer
206  *			structure is not inherently limited to a specific number
207  *			of cookies. It is always mapped in our virtual address
208  *			space and encapsulates the various DDI functions. In
209  *			general, one expects to interface with the idb_va member
210  *			when needing to access the memory, the idb_size member
211  *			when wanting to understand how much memory is in the
212  *			buffer, and the idb_hdl member when needing to access
213  *			the DMA cookies.
214  *
215  * igc_addr_t		This represents a 48-bit Ethernet MAC address slot in
216  *			the hardware that may or may not be used at any given
217  *			point in time.
218  *
219  * --------------------
220  * Rings and Interrupts
221  * --------------------
222  *
223  * The I225/226 controller like the I210 supports up to 4 rx and tx rings. Due
224  * to the long history of this controller and its tradition from the e1000g/igb
225  * days and much older parts like the 8254x series, it has two entirely
226  * different sets of interrupt modes. One where MSI-X is used and a mode where
227  * a single MSI or INTx interrupt is used. Currently the driver only supports
228  * the MSI-X mode as that gives us more flexibility and due to the fact that the
229  * interrupt modes and register handling are different, reduces the complexity
230  * in the driver.
231  *
232  * The hardware uses its IVAR registers to map specific queues to interrupts.
233  * Each rx queue and tx queue is mapped to a specific bit position in the IVAR
234  * and there is an additional IVAR register for miscellaneous causes like link
235  * state changes. While the IVAR register allows for several bits for MSI-X
236  * entries, for the most part, it appears that there is only support for values
237  * in the range [0, 4] based on the I210 which we believe extends to the I225/6.
238  *
239  * MSI-X mode causes the device's various interrupt registers to be split into
240  * two groups the 'legacy' and 'extended' (sometimes called advanced) ones. The
241  * extended ones all start with 'E'. When in MSI-X mode, the EICR (cause), EICS
242  * (cause set), EIAC (auto-clear), EIMS (mask set) registers all operate with
243  * indexes that refer to the MSI-X. The primary way to temporarily disable
244  * interrupts for polling is to remove the given MSI-X from the auto-clear set
245  * and to clear it from the enabled mask set.
246  *
247  * The implication of all of this is that we can only really disable interrupts
248  * for polling on a per-MSI-X basis. This generally means that the design for
249  * interrupts and rings is that all the tx rings and the link state change
250  * events share interrupt 0, while rx rings use interrupts 1-4. Because the x86
251  * 'apix' modules end up defaulting to two interrupts to a driver, we end up
252  * only supporting a single rx and tx ring for the time being, though the driver
253  * is phrased in terms of a variable number of such rings.
254  *
255  * -------------------
256  * RX Data Path Design
257  * -------------------
258  *
259  * The rx data path is based around allocating a fixed number of receive buffers
260  * for each ring. We have two goals in the allocation buffer and ring design:
261  *
262  * 1) We want to make sure that the ring is always full of valid descriptors for
263  *    rx to prevent stalls. One implication of this is that we will always
264  *    refill a received buffer with a new one and notify the hardware that the
265  *    buffer is usable again.
266  *
267  * 2) We would prefer to not have to copy received memory and instead bind the
268  *    DMA memory directly into an mblk_t.
269  *
270  * To satisfy (1) we need to allocate at least as many rx buffers as there are
271  * ring entries. The ring is sized by default to 512 entries, which is a
272  * somewhat arbitrary, but common, size. We then say that we want to be able to
273  * loan half of our entries up the stack at any given time. This leads to us
274  * allocating 1.5x the ring size rx buffers.
275  *
276  * All of the rx buffers are stored in the irr_arena array. They are then split
277  * between the free list and the ring's work list. The work list is an array
278  * that is a 1:1 mapping to a location in the descriptor ring. That is index 4
279  * of the work list (irr_work_list[4]) corresponds to index 4 of the descriptor
280  * ring (irr_ring[4]). However, this may refer to any of the rx descriptors that
281  * is in the irr_arena. When we start up the ring, the first ring size entries
282  * are all inserted into the work list and then the remaining entries are
283  * inserted into the free list.
284  *
285  * Entries that are in the work list are always given to hardware. We track the
286  * next place for us to scan for received packets through the 'irr_next' index
287  * into the descriptor ring. When an interrupt fires, we start at irr_next and
288  * iterate through the descriptor ring continuing while we find valid, received
289  * packets. When we process a packet, we look at two things to consider whether
290  * we bind it or copy it to a new mblk_t. The first piece is the received
291  * packet's length. If the packet is small, there is not much value in binding
292  * it and instead we just allocate and copy a new buffer for the packet.
293  *
294  * The second is if there are free rx descriptors. To keep goal (1) valid, we
295  * only will loan a packet up if there is an entry on the free list that can
296  * replace the rx buffer, as otherwise we'd want to make sure we don't stall the
297  * ring. If an rx buffer is loaned, the entry on the free list takes its place
298  * in the descriptor ring and when the networking stack is finally done with the
299  * mblk_t, it'll be returned to us as part of the freemsg()/freeb() destructor.
300  * This lifetime is illustrated in the following diagram:
301  *
302  *
303  *    +-------------+                        +-----------+
304  *    | Work List   |<---*-------------------| Free List |
305  *    | Owned by HW |    . . Used to replace |   Idle    |
306  *    +-------------+        loaned buffers  +-----------+
307  *      |     | ^                                  ^
308  *      |     | . . . Reused if a                  |
309  *      |     +-+     copy is done                 . . . Returned to driver via
310  *      |                                          |     freemsg() which calls
311  *      |                                          |     igc_rx_recycle().
312  *      v                                          |
313  *    +-------------------+                        |
314  *    | Loaned            |------------------------+
315  *    | Owned by netstack |
316  *    +-------------------+
317  *
318  * Currently the rx data path uses rx buffers that are equal to the maximum size
319  * of a packet (rounded up based on hardware's 1 KiB alignment requirement).
320  * This was mostly done for initial simplicity, though it comes at a memory
321  * cost. It is possible to design this to be more like the tx subsystem where we
322  * use fixed page size buffers and just cons up an mblk_t chain with b_cont
323  * pointers.
324  *
325  * -------------------
326  * TX Data Path Design
327  * -------------------
328  *
329  * The tx data path is a bit different in design from the rx data path. When the
330  * system wants to tx data there are two fundamental building blocks that we
331  * use, both of which leverage the igc_tx_buffer_t:
332  *
333  * 1) We use the DMA memory that is allocated with the buffer and copy the
334  *    mblk_t data into it. This is used when we have small mblk_t's.
335  *
336  * 2) We utilize the DMA handle that is in the tx buffer (but not the buffer's
337  *    DMA memory) to perform DMA binding. This can result in multiple cookies
338  *    and therefore descriptors mapping to the single buffer.
339  *
340  * Because a given tx buffer may end up using more than one descriptor and we
341  * have to account for transmit context descriptors, which are used for
342  * indicating checksum and segmentation offloads, we end up only allocating a
343  * number of transmit buffers equal to the ring size. In addition, the tx data
344  * buffer's maximum size is capped at the size of a single page. This is done
345  * because we often aren't going to be copying and if we are, we don't need that
346  * much more memory. The actual size may be smaller depending on the MTU.
347  *
348  * The tx descriptor ring is used in a bit of a different way. While part of the
349  * reason for this is that we are filling it based on the stack's demands and
350  * therefore only need to fill in descriptors when there's a need, the second
351  * reason is because of how the hardware reports back events. There are two
352  * major kinds of descriptors that can be entered into the ring. There are the
353  * aforementioned context descriptors and then data descriptors. While data
354  * descriptors support an interrupt on completion, context descriptors do not.
355  *
356  * When an mblk_t comes in to be transmitted, we walk all of the mblk_t's
357  * associated with it via the b_cont pointer. For each one, we look at the size
358  * of the data and determine whether or not to perform DMA binding or to copy it
359  * into the current tx buffer. A given tx buffer can be used to copy multiple
360  * different mblk_t's. Imagine a pathological case where we had a 500 byte
361  * packet split into 125 byte chunks, this would end up using a single tx data
362  * buffer.  However, if you imagine a large chunk of TCP data, this may be
363  * spread across several mblk_t's so we may end up leveraging multiple tx data
364  * buffers.
365  *
366  * The transmit buffers that are available are stored on a free list. This is
367  * managed as a list_t as we end up needing to often track groups of descriptors
368  * to allocate and free across packet transmit and recycling. We don't count the
369  * number of transmit buffers that are free per se, but it generally tracks the
370  * number of free descriptors which do track as in the worst case there is a 1:1
371  * relationship between buffers and descriptors and more generally it's 1:n,
372  * that is there are multiple descriptors used for a single buffer.
373  *
374  * The transmit ring is managed through a combination of three integers, the
375  * itr_ring_head, the itr_ring_tail, and the itr_ring_free. The ring's tail
376  * represents the place where the driver will place new data to transmit. The
377  * ring's head represents the first place that we should check for a packet's
378  * completion when we're performing recycling (the act of acknowledging what
379  * hardware has processed internal to the driver) due to a tx interrupt or
380  * manual recycling in the transmit path.
381  *
382  * When placing a packet as a series of descriptor rings we'll end up doing the
383  * following:
384  *
385  * 1) First we determine how to map each mblk_t as mentioned above.
386  * 2) This will then be turned into descriptors in the ring. Each tx data buffer
387  *    that is used is placed in the itr_work_list at the corresponding index
388  *    that they are used in the ring. There is one special case here, if a
389  *    context descriptor is used, the first transmit buffer will refer to the
390  *    context descriptor's entry (which always comes before data).
391  * 3) We'll ensure that there are enough descriptors for this packet to fit into
392  *    the ring or if it would exceed our mandatory gap threshold. If so, then
393  *    we'll undo all the work we just did and return the mblk_t to MAC and
394  *    indicate that the ring is blocked. MAC will be notified later when we free
395  *    up transmit descriptors.
396  * 4) In the first transmit data buffer we'll store both the mblk_t and then
397  *    we'll store what the index of the last descriptor that's used is. This is
398  *    important for recycling. We also indicate that the last descriptor should
399  *    be the one that reports its status on interrupt completion.
400  * 5) We'll notify hardware that there is data for it to transmit by writing to
401  *    the ring's tail pointer.
402  *
403  * This all works reasonably okay, except for the small problem of the bill,
404  * which we pay off in the form of recycling. Recycling is going through the
405  * ring and seeing which descriptors are free. While the transmit path described
406  * above is the only path that is allowed to move the tail, the recycling path
407  * is the only one that's allowed to adjust the head.
408  *
409  * When we perform recycling we look at the current head and its corresponding
410  * tx buffer. There will always be a tx buffer in the same index in the
411  * itr_work_list[] unless a serious programmer error has occurred. This buffer
412  * will tell us what the index to check for completion is via its itb_last_desc
413  * member (only valid when itb_first is set to true). If this index indicates
414  * that it has been processed by hardware, then we process all entries between
415  * here and there.
416  *
417  * When we process descriptors, we bunch up the transmit descriptors and
418  * mblk_t's. We'll reset the transmit descriptor (freeing any DMA binding if
419  * used) and append the mblk_t if it exists to be freed in one large
420  * freemsgchain() at the end. The fact that we won't free any tx buffers
421  * associated with a packet until they're all done is important. This makes
422  * sure that any memory that we have bound from the mblk_t remains valid the
423  * entire time.
424  *
425  * If we have freed enough descriptors as part of this to allow mac to send data
426  * again, then once we have finished all processing and dropped the lock, we
427  * will notify MAC.
428  *
429  * When we are processing descriptors here we try to avoid holding the itr_lock
430  * except for the start and end of the process. This is an important way to
431  * ensure that we don't block transmits. Because of this, there can only be one
432  * thread performing a recycle at any given time between the interrupt path and
433  * the transmit path trying to clean up. This is maintained using the
434  * 'itr_recycle' boolean. If a recycle is already in progress then there's
435  * generally not much reason to perform one simultaneously and so the caller
436  * will just return. This is why the head (and thus returning descriptors) is
437  * only used by the recycle path.
438  *
439  * -------
440  * Locking
441  * -------
442  *
443  * Mutexes exist on three different structures in the driver:
444  *
445  * 1) igc_t (igc_lock)
446  * 2) igc_rx_ring_t (irr_lock, irr_free_lock)
447  * 3) igc_tx_ring_t (itr_lock)
448  *
449  * The following rules hold for locking in the driver:
450  *
451  * 1) One should not hold locks for both the rx rings and tx rings at the same
452  *    time. If this is required, please determine if it is absolutely necessary.
453  * 2) You should always take the controller's lock ahead of any ring's locks.
454  * 3) The general rx ring lock (irr_lock) should be taken ahead of the free list
455  *    lock (irr_free_lock) if both are required.
456  *
457  * -------------------
458  * Future Improvements
459  * -------------------
460  *
461  * This driver was initially written with an eye towards getting something that
462  * had broad use for folks with this hardware and not towards enabling every
463  * feature immediately. Here are some areas that can be improved upon in the
464  * driver.
465  *
466  *  - Multiple ring, RSS support: As the OS changes towards offering more
467  *    interrupts or opting to participate in IRM, then you can more easily
468  *    offer RSS and related features. This should likely show up as a single
469  *    rx group with multiple rings and leverage the tx pseudo-group support.
470  *
471  *  - TCP segmentation offload support: Right now the driver does not support
472  *    TSO. It'd potentially be a useful addition and help out folks. Fetching
473  *    information for TSO is in the tx data path right now.
474  *
475  *  - FMA Support: Currently the driver does not rig up support for FMA.
476  *    Participating in that and more generally being able to reset the device
477  *    while it is operating in the face of fatal errors would be good.
478  *
479  *  - TX stall detection: Related to the above, carefully designing a tx stall
480  *    detection and resetting the device when that happens would probably be
481  *    useful.
482  *
483  *  - UFM support: Exposing the NVM and PBA (printed board assembly) through the
484  *    UFM subsystem would be a good thing to do.
485  *
486  *  - Dynamic MTU changing: Right now the driver takes advantage of the
487  *    simplification of not allowing the MTU to change once the device has been
488  *    started. This isn't great, but it is far from the first (igb, e1000g,
489  *    ixgbe, etc.) to do this. It would be nice if this was lifted.
490  */
491 
492 #include <sys/ddi.h>
493 #include <sys/sunddi.h>
494 #include <sys/conf.h>
495 #include <sys/devops.h>
496 #include <sys/modctl.h>
497 #include <sys/cmn_err.h>
498 #include <sys/pci.h>
499 #include <sys/sysmacros.h>
500 #include <sys/debug.h>
501 #include <sys/bitext.h>
502 
503 #include "igc.h"
504 
505 /*
506  * The core code expects the igc_mcast_raw to be a uint8_t packed array. We use
507  * the ether_addr_t to make this a little more explicit and easy to reason
508  * about, but that means we are relying on this size.
509  */
510 CTASSERT(sizeof (ether_addr_t) == 6);
511 
512 uint32_t
513 igc_read32(igc_t *igc, uint32_t reg)
514 {
515 	uint32_t *addr;
516 	ASSERT3U(reg, <, igc->igc_regs_size);
517 	addr = (uint32_t *)(igc->igc_regs_base + reg);
518 	return (ddi_get32(igc->igc_regs_hdl, addr));
519 }
520 
521 void
522 igc_write32(igc_t *igc, uint32_t reg, uint32_t val)
523 {
524 	uint32_t *addr;
525 	ASSERT3U(reg, <, igc->igc_regs_size);
526 	addr = (uint32_t *)(igc->igc_regs_base + reg);
527 	ddi_put32(igc->igc_regs_hdl, addr, val);
528 }
529 
530 /*
531  * Ask hardware if the link is up and ready. Note, this assumes that we're on a
532  * copper phy and short circuits a few things. See igb_is_link_up() for what
533  * this looks like for non-copper PHYs if that ever becomes relevant.
534  */
535 static bool
536 igc_link_up(igc_t *igc)
537 {
538 	ASSERT(MUTEX_HELD(&igc->igc_lock));
539 
540 	/*
541 	 * When the link is up, then the core code will clear the value below.
542 	 * Otherwise we likely need to assume it's down.
543 	 */
544 	(void) igc_check_for_link(&igc->igc_hw);
545 	return (!igc->igc_hw.mac.get_link_status);
546 }
547 
548 static void
549 igc_intr_lsc(igc_t *igc)
550 {
551 	link_state_t orig_state, new_state;
552 	uint32_t mmd_base;
553 
554 	mutex_enter(&igc->igc_lock);
555 	orig_state = igc->igc_link_state;
556 
557 	/*
558 	 * Always force a check of the link.
559 	 */
560 	igc->igc_hw.mac.get_link_status = true;
561 	if (igc_link_up(igc)) {
562 		uint16_t duplex = 0;
563 
564 		(void) igc_get_speed_and_duplex(&igc->igc_hw,
565 		    &igc->igc_link_speed, &duplex);
566 
567 		switch (duplex) {
568 		case HALF_DUPLEX:
569 			igc->igc_link_duplex = LINK_DUPLEX_HALF;
570 			break;
571 		case FULL_DUPLEX:
572 			igc->igc_link_duplex = LINK_DUPLEX_FULL;
573 			break;
574 		default:
575 			igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
576 			break;
577 		}
578 		igc->igc_link_state = LINK_STATE_UP;
579 	} else {
580 		igc->igc_link_state = LINK_STATE_DOWN;
581 		igc->igc_link_speed = 0;
582 		igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
583 	}
584 	new_state = igc->igc_link_state;
585 
586 	/*
587 	 * Next, grab a bunch of information from the PHY for future us.
588 	 */
589 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_CONTROL, &igc->igc_phy_ctrl);
590 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_STATUS, &igc->igc_phy_status);
591 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_ADV,
592 	    &igc->igc_phy_an_adv);
593 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_LP_ABILITY,
594 	    &igc->igc_phy_lp);
595 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_EXP,
596 	    &igc->igc_phy_an_exp);
597 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_CTRL,
598 	    &igc->igc_phy_1000t_ctrl);
599 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_STATUS,
600 	    &igc->igc_phy_1000t_status);
601 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS,
602 	    &igc->igc_phy_ext_status);
603 	(void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS,
604 	    &igc->igc_phy_ext_status);
605 
606 	mmd_base = STANDARD_AN_REG_MASK << MMD_DEVADDR_SHIFT;
607 	(void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_CTRL,
608 	    &igc->igc_phy_mmd_ctrl);
609 	(void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_STS1,
610 	    &igc->igc_phy_mmd_sts);
611 	mutex_exit(&igc->igc_lock);
612 
613 	if (orig_state != new_state) {
614 		mac_link_update(igc->igc_mac_hdl, new_state);
615 	}
616 }
617 
618 static uint_t
619 igc_intr_rx_queue(caddr_t arg1, caddr_t arg2)
620 {
621 	igc_t *igc = (igc_t *)arg1;
622 	uintptr_t queue = (uintptr_t)arg2;
623 	igc_rx_ring_t *ring;
624 	mblk_t *mp = NULL;
625 
626 	ASSERT3U(queue, <, igc->igc_nrx_rings);
627 	ring = &igc->igc_rx_rings[queue];
628 
629 	mutex_enter(&ring->irr_lock);
630 	if ((ring->irr_flags & IGC_RXR_F_POLL) == 0) {
631 		mp = igc_ring_rx(ring, IGC_RX_POLL_INTR);
632 	}
633 	mutex_exit(&ring->irr_lock);
634 
635 	if (mp != NULL) {
636 		mac_rx_ring(igc->igc_mac_hdl, ring->irr_rh, mp, ring->irr_gen);
637 	}
638 
639 	return (DDI_INTR_CLAIMED);
640 }
641 
642 static uint_t
643 igc_intr_tx_other(caddr_t arg1, caddr_t arg2)
644 {
645 	igc_t *igc = (igc_t *)arg1;
646 	uint32_t icr = igc_read32(igc, IGC_ICR);
647 
648 	igc_tx_recycle(igc, &igc->igc_tx_rings[0]);
649 
650 	if ((icr & IGC_ICR_LSC) != 0) {
651 		igc_intr_lsc(igc);
652 	}
653 
654 	return (DDI_INTR_CLAIMED);
655 }
656 
657 static bool
658 igc_setup_regs(igc_t *igc)
659 {
660 	int ret;
661 	ddi_device_acc_attr_t da;
662 
663 	if (pci_config_setup(igc->igc_dip, &igc->igc_cfgspace) != DDI_SUCCESS) {
664 		dev_err(igc->igc_dip, CE_WARN, "failed to map config space");
665 		return (false);
666 	}
667 
668 	if (ddi_dev_regsize(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_size) !=
669 	    DDI_SUCCESS) {
670 		dev_err(igc->igc_dip, CE_WARN, "failed to get BAR %u size",
671 		    IGC_PCI_BAR - 1);
672 		return (false);
673 	}
674 
675 	bzero(&da, sizeof (ddi_device_acc_attr_t));
676 	da.devacc_attr_version = DDI_DEVICE_ATTR_V1;
677 	da.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC;
678 	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
679 	da.devacc_attr_access = DDI_DEFAULT_ACC;
680 
681 	if ((ret = ddi_regs_map_setup(igc->igc_dip, IGC_PCI_BAR,
682 	    &igc->igc_regs_base, 0, igc->igc_regs_size, &da,
683 	    &igc->igc_regs_hdl)) != DDI_SUCCESS) {
684 		dev_err(igc->igc_dip, CE_WARN, "failed to map registers: %d",
685 		    ret);
686 		return (false);
687 	}
688 
689 	return (true);
690 }
691 
692 /*
693  * Go through the process of initializing the igc core code. First we have to
694  * fill in the information that the common code requires to identify the
695  * hardware and set the mac type. After that we can go through and set up all of
696  * the function initialization.
697  */
698 static bool
699 igc_core_code_init(igc_t *igc)
700 {
701 	int ret;
702 	int *regs;
703 	uint_t nprop;
704 
705 	igc->igc_hw.back = igc;
706 	igc->igc_hw.vendor_id = pci_config_get16(igc->igc_cfgspace,
707 	    PCI_CONF_VENID);
708 	igc->igc_hw.device_id = pci_config_get16(igc->igc_cfgspace,
709 	    PCI_CONF_DEVID);
710 	igc->igc_hw.revision_id = pci_config_get8(igc->igc_cfgspace,
711 	    PCI_CONF_REVID);
712 	igc->igc_hw.subsystem_vendor_id = pci_config_get16(igc->igc_cfgspace,
713 	    PCI_CONF_SUBVENID);
714 	igc->igc_hw.subsystem_device_id = pci_config_get16(igc->igc_cfgspace,
715 	    PCI_CONF_SUBSYSID);
716 
717 	if ((ret = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, igc->igc_dip,
718 	    DDI_PROP_DONTPASS, "reg", &regs, &nprop)) != DDI_PROP_SUCCESS) {
719 		dev_err(igc->igc_dip, CE_WARN, "failed to look up 'reg' "
720 		    "property: %d", ret);
721 		return (false);
722 	}
723 
724 	/*
725 	 * We fill out the function and command word. We currently don't fill
726 	 * out the bus type, speed, and width as it's not used by the common
727 	 * code, leaving it all at unknown. We can grab that information when it
728 	 * needs it. We do fill out the function and command word as the former
729 	 * is important and the latter is easy to grab.
730 	 */
731 	igc->igc_hw.bus.func = PCI_REG_FUNC_G(regs[0]);
732 	igc->igc_hw.bus.pci_cmd_word = pci_config_get16(igc->igc_cfgspace,
733 	    PCI_CONF_COMM);
734 	ddi_prop_free(regs);
735 
736 	/*
737 	 * The common code asks for the memory mapped address to be set in its
738 	 * structure. Though in theory it promises not to use it.
739 	 */
740 	igc->igc_hw.hw_addr = (uint8_t *)igc->igc_regs_base;
741 
742 	if ((ret = igc_set_mac_type(&igc->igc_hw)) != IGC_SUCCESS) {
743 		dev_err(igc->igc_dip, CE_WARN, "failed to set mac type: %d",
744 		    ret);
745 		return (false);
746 	}
747 
748 	if ((ret = igc_setup_init_funcs(&igc->igc_hw, true)) != IGC_SUCCESS) {
749 		dev_err(igc->igc_dip, CE_WARN, "failed to setup core code "
750 		    "function pointers: %d", ret);
751 		return (false);
752 	}
753 
754 	/*
755 	 * Go ahead and attempt to get the bus information even though this
756 	 * doesn't actually do anything right now.
757 	 */
758 	if ((ret = igc_get_bus_info(&igc->igc_hw)) != IGC_SUCCESS) {
759 		dev_err(igc->igc_dip, CE_WARN, "core code failed to get bus "
760 		    "info: %d", ret);
761 		return (false);
762 	}
763 
764 	return (true);
765 }
766 
767 static bool
768 igc_limits_init(igc_t *igc)
769 {
770 	switch (igc->igc_hw.mac.type) {
771 	case igc_i225:
772 		igc->igc_limits.il_max_rx_rings = IGC_MAX_RX_RINGS_I225;
773 		igc->igc_limits.il_max_tx_rings = IGC_MAX_RX_RINGS_I225;
774 		igc->igc_limits.il_max_mtu = IGC_MAX_MTU_I225;
775 		break;
776 	default:
777 		dev_err(igc->igc_dip, CE_WARN, "unknown MAC type: %u",
778 		    igc->igc_hw.mac.type);
779 		return (false);
780 	}
781 
782 	return (true);
783 }
784 
785 /*
786  * Determine the hardware buffer sizes that are required for the given MTU.
787  * There are a few different constraints that we try to enforce here that come
788  * from the hardware and others that come from us:
789  *
790  * 1) The hardware requires that the rx and tx sizes all be 1 KiB (0x400) byte
791  * aligned.
792  * 2) Our tx engine can handle copying across multiple descriptors, so we cap
793  * the maximum tx buffer size at one page.
794  * 3) Right now our rx engine does not handle scanning multiple buffers for rx
795  * (see the theory statement), so we end up making the rx buffer have to fix the
796  * maximum frame size.
797  * 4) rx buffers need to also account for IP alignment, so we make sure to
798  * allocate extra bytes for that.
799  */
800 void
801 igc_hw_buf_update(igc_t *igc)
802 {
803 	unsigned long pagesize = ddi_ptob(igc->igc_dip, 1);
804 	uint32_t tx_mtu;
805 
806 	igc->igc_max_frame = igc->igc_mtu + sizeof (struct ether_vlan_header) +
807 	    ETHERFCSL;
808 	igc->igc_rx_buf_size = P2ROUNDUP_TYPED(igc->igc_max_frame +
809 	    IGC_RX_BUF_IP_ALIGN, IGC_BUF_ALIGN, uint32_t);
810 	tx_mtu = P2ROUNDUP_TYPED(igc->igc_max_frame, IGC_BUF_ALIGN, uint32_t);
811 	igc->igc_tx_buf_size = MIN(tx_mtu, pagesize);
812 }
813 
814 static bool
815 igc_intr_init(igc_t *igc)
816 {
817 	int ret, types, nintrs, navail, req;
818 	const int min_nintrs = 2;
819 
820 	if ((ret = ddi_intr_get_supported_types(igc->igc_dip, &types)) !=
821 	    DDI_SUCCESS) {
822 		dev_err(igc->igc_dip, CE_WARN, "failed to get supported "
823 		    "interrupts: %d", ret);
824 		return (false);
825 	}
826 
827 	/*
828 	 * For now, we simplify our lives and device support by only supporting
829 	 * MSI-X interrupts. When we find versions of this without MSI-X
830 	 * support, we can go and add what we need.
831 	 */
832 	if ((types & DDI_INTR_TYPE_MSIX) == 0) {
833 		dev_err(igc->igc_dip, CE_WARN, "device does not support MSI-X, "
834 		    "found %d", types);
835 		return (false);
836 	}
837 
838 	if ((ret = ddi_intr_get_nintrs(igc->igc_dip, DDI_INTR_TYPE_MSIX,
839 	    &nintrs)) != DDI_SUCCESS) {
840 		dev_err(igc->igc_dip, CE_WARN, "failed to get number of "
841 		    "supported MSI-X interrupts: %d", ret);
842 		return (false);
843 	}
844 
845 	if (nintrs < min_nintrs) {
846 		dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires "
847 		    "%d MSI-X interrupts be supported, found %d", min_nintrs,
848 		    nintrs);
849 		return (false);
850 	}
851 
852 	if ((ret = ddi_intr_get_navail(igc->igc_dip, DDI_INTR_TYPE_MSIX,
853 	    &navail)) != DDI_SUCCESS) {
854 		dev_err(igc->igc_dip, CE_WARN, "failed to get number of "
855 		    "available MSI-X interrupts: %d", ret);
856 		return (false);
857 	}
858 
859 	if (navail < min_nintrs) {
860 		dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires "
861 		    "%d MSI-X interrupts be available, found %d", min_nintrs,
862 		    navail);
863 		return (false);
864 	}
865 
866 	/*
867 	 * In the future this could be based upon the multiple queues that the
868 	 * device supports, but for now it's limited to two. See 'Rings and
869 	 * Interrupts' in the theory statement for more background.
870 	 */
871 	req = min_nintrs;
872 	req = MIN(req, navail);
873 	igc->igc_intr_size = req * sizeof (ddi_intr_handle_t);
874 	igc->igc_intr_handles = kmem_alloc(igc->igc_intr_size, KM_SLEEP);
875 
876 	if ((ret = ddi_intr_alloc(igc->igc_dip, igc->igc_intr_handles,
877 	    DDI_INTR_TYPE_MSIX, 0, req, &igc->igc_nintrs,
878 	    DDI_INTR_ALLOC_NORMAL)) != DDI_SUCCESS) {
879 		dev_err(igc->igc_dip, CE_WARN, "failed to allocate interrupts: "
880 		    "%d", ret);
881 		return (false);
882 	}
883 
884 	igc->igc_intr_type = DDI_INTR_TYPE_MSIX;
885 	igc->igc_attach |= IGC_ATTACH_INTR_ALLOC;
886 	if (igc->igc_nintrs < min_nintrs) {
887 		dev_err(igc->igc_dip, CE_WARN, "received %d interrupts, but "
888 		    "needed at least %d", igc->igc_nintrs, min_nintrs);
889 		return (false);
890 	}
891 
892 	if ((ret = ddi_intr_get_pri(igc->igc_intr_handles[0],
893 	    &igc->igc_intr_pri)) != DDI_SUCCESS) {
894 		dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt "
895 		    "priority: %d", ret);
896 		return (false);
897 	}
898 
899 	if ((ret = ddi_intr_get_cap(igc->igc_intr_handles[0],
900 	    &igc->igc_intr_cap)) != DDI_SUCCESS) {
901 		dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt "
902 		    "capabilities: %d", ret);
903 		return (false);
904 	}
905 
906 	return (true);
907 }
908 
909 /*
910  * As part of allocating our rings we make the following assumptions about
911  * interrupt assignments. All tx rings share interrupt 0. All rx rings have
912  * separate interrupts starting from interrupt 1. This design may likely change
913  * in the face of actual multi-ring support
914  */
915 static bool
916 igc_rings_alloc(igc_t *igc)
917 {
918 	uint32_t intr = 0;
919 	igc->igc_tx_rings = kmem_zalloc(sizeof (igc_tx_ring_t) *
920 	    igc->igc_ntx_rings, KM_SLEEP);
921 
922 	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
923 		igc->igc_tx_rings[i].itr_igc = igc;
924 		igc->igc_tx_rings[i].itr_idx = i;
925 		igc->igc_tx_rings[i].itr_intr_idx = intr;
926 		mutex_init(&igc->igc_tx_rings[i].itr_lock, NULL, MUTEX_DRIVER,
927 		    DDI_INTR_PRI(igc->igc_intr_pri));
928 		if (!igc_tx_ring_stats_init(igc, &igc->igc_tx_rings[i])) {
929 			return (false);
930 		}
931 	}
932 
933 	igc->igc_rx_rings = kmem_zalloc(sizeof (igc_rx_ring_t) *
934 	    igc->igc_nrx_rings, KM_SLEEP);
935 	intr = 1;
936 
937 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++, intr++) {
938 		igc->igc_rx_rings[i].irr_igc = igc;
939 		igc->igc_rx_rings[i].irr_idx = i;
940 		igc->igc_rx_rings[i].irr_intr_idx = intr;
941 		mutex_init(&igc->igc_rx_rings[i].irr_lock, NULL, MUTEX_DRIVER,
942 		    DDI_INTR_PRI(igc->igc_intr_pri));
943 		mutex_init(&igc->igc_rx_rings[i].irr_free_lock, NULL,
944 		    MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri));
945 		cv_init(&igc->igc_rx_rings[i].irr_free_cv, NULL, CV_DRIVER,
946 		    NULL);
947 		if (!igc_rx_ring_stats_init(igc, &igc->igc_rx_rings[i])) {
948 			return (false);
949 		}
950 	}
951 
952 	ASSERT3U(intr, ==, igc->igc_nintrs);
953 
954 	return (true);
955 }
956 
957 /*
958  * Allocate our interrupts. Note, we have more or less constrained the device
959  * right now to only request two interrupts which we use in a fixed way. If we
960  * end up with more varied queue support then this should be changed around.
961  */
962 static bool
963 igc_intr_hdlr_init(igc_t *igc)
964 {
965 	int ret;
966 
967 	if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[0],
968 	    igc_intr_tx_other, igc, NULL)) != DDI_SUCCESS) {
969 		dev_err(igc->igc_dip, CE_WARN, "failed to add tx/other "
970 		    "interrupt handler: %d", ret);
971 		return (false);
972 	}
973 
974 	if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[1],
975 	    igc_intr_rx_queue, igc, (uintptr_t)0)) != DDI_SUCCESS) {
976 		dev_err(igc->igc_dip, CE_WARN, "failed to add rx interrupt "
977 		    "handler: %d", ret);
978 		if ((ret = ddi_intr_remove_handler(igc->igc_intr_handles[0])) !=
979 		    DDI_SUCCESS) {
980 			dev_err(igc->igc_dip, CE_WARN, "failed to remove "
981 			    "tx/other interrupt handler");
982 		}
983 		return (false);
984 	}
985 
986 	return (true);
987 }
988 
989 static void
990 igc_hw_control(igc_t *igc, bool take)
991 {
992 	uint32_t ctrl = igc_read32(igc, IGC_CTRL_EXT);
993 
994 	if (take) {
995 		ctrl |= IGC_CTRL_EXT_DRV_LOAD;
996 	} else {
997 		ctrl &= ~IGC_CTRL_EXT_DRV_LOAD;
998 	}
999 
1000 	igc_write32(igc, IGC_CTRL_EXT, ctrl);
1001 }
1002 
1003 /*
1004  * Basic device initialization and sanity check. This covers that we can
1005  * properly reset the device, validate its checksum, and get a valid MAC
1006  * address.
1007  */
1008 static bool
1009 igc_hw_init(igc_t *igc)
1010 {
1011 	int ret;
1012 	uint32_t eecd;
1013 
1014 	if ((ret = igc_reset_hw(&igc->igc_hw)) != IGC_SUCCESS) {
1015 		dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
1016 		    ret);
1017 		return (false);
1018 	}
1019 
1020 	/*
1021 	 * Goodbye firmware.
1022 	 */
1023 	igc_hw_control(igc, true);
1024 
1025 	/*
1026 	 * Check the NVM validiity if a device is present.
1027 	 */
1028 	eecd = igc_read32(igc, IGC_EECD);
1029 	if ((eecd & IGC_EECD_EE_DET) != 0) {
1030 		if ((ret = igc_validate_nvm_checksum(&igc->igc_hw)) !=
1031 		    IGC_SUCCESS) {
1032 			dev_err(igc->igc_dip, CE_WARN, "failed to validate "
1033 			    "igc NVM checksum: %d", ret);
1034 			return (false);
1035 		}
1036 	}
1037 
1038 	if ((ret = igc_read_mac_addr(&igc->igc_hw)) != IGC_SUCCESS) {
1039 		dev_err(igc->igc_dip, CE_WARN, "failed to read MAC address: %d",
1040 		    ret);
1041 		return (false);
1042 	}
1043 
1044 	if ((ret = igc_get_phy_id(&igc->igc_hw)) != IGC_SUCCESS) {
1045 		dev_err(igc->igc_dip, CE_WARN, "failed to get PHY id: %d", ret);
1046 		return (false);
1047 	}
1048 
1049 	return (true);
1050 }
1051 
1052 /*
1053  * In case the user has modified the LED state through MAC_CAPAB_LED, restore
1054  * that back to the defaults we got when we started up the device.
1055  */
1056 static void
1057 igc_led_fini(igc_t *igc)
1058 {
1059 	igc_write32(igc, IGC_LEDCTL, igc->igc_ledctl);
1060 }
1061 
1062 /*
1063  * Traditionally the Intel NIC drivers avoid touching activity pins as part of
1064  * their behavior for what we use. We also don't touch a pin if it's in SDP mode
1065  * and not being used to drive an LED as it means it's likely not for us.
1066  */
1067 static bool
1068 igc_led_ignore(i225_led_mode_t mode)
1069 {
1070 	switch (mode) {
1071 	case I225_LED_M_FILTER_ACT:
1072 	case I225_LED_M_LINK_ACT:
1073 	case I225_LED_M_SDP:
1074 	case I225_LED_M_PAUSE:
1075 	case I225_LED_M_ACT:
1076 		return (true);
1077 	default:
1078 		return (false);
1079 	}
1080 }
1081 
1082 static inline uint32_t
1083 igc_led_bitoff(uint32_t led)
1084 {
1085 	VERIFY3U(led, <, 3);
1086 	return (led * 8);
1087 }
1088 
1089 static inline uint32_t
1090 igc_led_get_mode(uint32_t led, uint32_t reg)
1091 {
1092 	uint32_t off = igc_led_bitoff(led);
1093 	return (bitx32(reg, 3 + off, off));
1094 }
1095 
1096 static inline uint32_t
1097 igc_led_set_mode(uint32_t led, uint32_t reg, i225_led_mode_t mode)
1098 {
1099 	uint32_t off = igc_led_bitoff(led);
1100 	return (bitset32(reg, 3 + off, off, mode));
1101 }
1102 
1103 static inline uint32_t
1104 igc_led_get_ivrt(uint32_t led, uint32_t reg)
1105 {
1106 	uint32_t off = igc_led_bitoff(led) + 6;
1107 	return (bitx32(reg, off, off));
1108 }
1109 
1110 static inline uint32_t
1111 igc_led_set_blink(uint32_t led, uint32_t reg, bool en)
1112 {
1113 	uint32_t off = igc_led_bitoff(led) + 7;
1114 	return (bitset32(reg, off, off, en));
1115 }
1116 
1117 /*
1118  * There are three LEDs on the chip. The reference defines LED0 for 1 GbE link
1119  * up, LED1 for a 2.5GbE link up, and LED 2 for activity. However, this is all
1120  * controllable in the NVM so we shouldn't assume that these have any of their
1121  * default values. We instead read the LEDCTL register to see how it was set up
1122  * by default (though the NVM would likely be better). We then create pre-canned
1123  * LEDCTL register values for on, off, and default. See igc_osdep.h for some of
1124  * the caveats in definitions here. Note, we only tweak the non-activity LEDs
1125  * and if an LED has been indicated that it's being used for SDP, we don't touch
1126  * it.
1127  */
1128 static void
1129 igc_led_init(igc_t *igc)
1130 {
1131 	uint32_t led = igc_read32(igc, IGC_LEDCTL);
1132 
1133 	igc->igc_ledctl = led;
1134 	igc->igc_ledctl_on = led;
1135 	igc->igc_ledctl_off = led;
1136 	igc->igc_ledctl_blink = led;
1137 
1138 	for (uint32_t i = 0; i < IGC_I225_NLEDS; i++) {
1139 		i225_led_mode_t mode = igc_led_get_mode(i, led);
1140 		if (!igc_led_ignore(mode)) {
1141 			/*
1142 			 * If the inversion logic is on, that changes what the
1143 			 * on and off modes mean, so we need to change how we
1144 			 * set that appropriately.
1145 			 */
1146 			if (igc_led_get_ivrt(i, led) != 0) {
1147 				igc->igc_ledctl_on = igc_led_set_mode(i,
1148 				    igc->igc_ledctl_on, I225_LED_M_OFF);
1149 				igc->igc_ledctl_off = igc_led_set_mode(i,
1150 				    igc->igc_ledctl_off, I225_LED_M_ON);
1151 				igc->igc_ledctl_blink = igc_led_set_mode(i,
1152 				    igc->igc_ledctl_blink, I225_LED_M_OFF);
1153 			} else {
1154 				igc->igc_ledctl_on = igc_led_set_mode(i,
1155 				    igc->igc_ledctl_on, I225_LED_M_ON);
1156 				igc->igc_ledctl_off = igc_led_set_mode(i,
1157 				    igc->igc_ledctl_off, I225_LED_M_OFF);
1158 				igc->igc_ledctl_blink = igc_led_set_mode(i,
1159 				    igc->igc_ledctl_blink, I225_LED_M_ON);
1160 			}
1161 		}
1162 
1163 		igc->igc_ledctl_blink = igc_led_set_blink(i,
1164 		    igc->igc_ledctl_blink, true);
1165 	}
1166 
1167 	igc->igc_led_mode = MAC_LED_DEFAULT;
1168 }
1169 
1170 static void
1171 igc_write_ivar(igc_t *igc, uint32_t queue, bool rx, uint32_t msix)
1172 {
1173 	const uint32_t ivarno = queue >> 1;
1174 	const uint32_t reg = IGC_IVAR0 + ivarno * 4;
1175 	const uint32_t val = msix | IGC_IVAR_VALID;
1176 	uint32_t bitoff, bitend, ivar;
1177 
1178 	if (rx) {
1179 		if ((queue % 2) == 0) {
1180 			bitoff = IGC_IVAR_RX0_START;
1181 		} else {
1182 			bitoff = IGC_IVAR_RX1_START;
1183 		}
1184 	} else {
1185 		if ((queue % 2) == 0) {
1186 			bitoff = IGC_IVAR_TX0_START;
1187 		} else {
1188 			bitoff = IGC_IVAR_TX1_START;
1189 		}
1190 	}
1191 	bitend = bitoff + IGC_IVAR_ENT_LEN - 1;
1192 
1193 	ivar = igc_read32(igc, reg);
1194 	ivar = bitset32(ivar, bitend, bitoff, val);
1195 	igc_write32(igc, reg, ivar);
1196 	igc->igc_eims |= 1 << msix;
1197 }
1198 
1199 /*
1200  * Here we need to go through and initialize the hardware's notion of how
1201  * interrupts are mapped to causes. The device must be specifically enabled for
1202  * MSI-X and then this is also where we go ensure that all of our interrupt
1203  * coalescing is properly enabled. Note, we must first touch the GPIE register
1204  * to enable MSI-X settings otherwise later settings won't do anything.
1205  */
1206 static void
1207 igc_hw_intr_init(igc_t *igc)
1208 {
1209 	uint32_t gpie, ivar;
1210 
1211 	gpie = IGC_GPIE_NSICR | IGC_GPIE_MSIX_MODE | IGC_GPIE_EIAME |
1212 	    IGC_GPIE_PBA;
1213 	igc_write32(igc, IGC_GPIE, gpie);
1214 
1215 	/*
1216 	 * Other causes are always explicitly mapped to cause 0. Each ring then
1217 	 * has its own mapping. In the MISC IVAR, these start at bit 8. We leave
1218 	 * the '0 |' out below just to avoid a compiler complaining. We also
1219 	 * must unamsk this interrupt cause, which is in bit 0.
1220 	 */
1221 	ivar = IGC_IVAR_VALID << 8;
1222 	igc_write32(igc, IGC_IVAR_MISC, ivar);
1223 	igc->igc_eims = 1;
1224 
1225 	/*
1226 	 * There are a few IVAR registers available in hardware. Each IVAR
1227 	 * register handles mapping a given queue to an MSI-X. Each IVAR handles
1228 	 * two queues.
1229 	 */
1230 	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
1231 		igc_write_ivar(igc, i, false,
1232 		    igc->igc_tx_rings[i].itr_intr_idx);
1233 	}
1234 
1235 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
1236 		igc_write_ivar(igc, i, true, igc->igc_rx_rings[i].irr_intr_idx);
1237 	}
1238 
1239 	for (uint32_t i = 0; i < igc->igc_nintrs; i++) {
1240 		igc_write32(igc, IGC_EITR(i), igc->igc_eitr);
1241 	}
1242 }
1243 
1244 /*
1245  * Synchronize our sense of the unicast table over to the device. If this is the
1246  * first time that we're here due to attach, we need to go through and allocate
1247  * the tracking table.
1248  */
1249 static void
1250 igc_unicast_sync(igc_t *igc)
1251 {
1252 	ASSERT(MUTEX_HELD(&igc->igc_lock));
1253 
1254 	if (igc->igc_ucast == NULL) {
1255 		igc->igc_nucast = igc->igc_hw.mac.rar_entry_count;
1256 		igc->igc_ucast = kmem_zalloc(sizeof (igc_addr_t) *
1257 		    igc->igc_nucast, KM_SLEEP);
1258 	}
1259 
1260 	for (uint16_t i = 0; i < igc->igc_nucast; i++) {
1261 		int ret = igc_rar_set(&igc->igc_hw, igc->igc_ucast[i].ia_mac,
1262 		    i);
1263 		/*
1264 		 * Common code today guarantees this can't fail. Put this here
1265 		 * to ensure to guard against future updates.
1266 		 */
1267 		VERIFY3S(ret, ==, IGC_SUCCESS);
1268 	}
1269 
1270 }
1271 
1272 /*
1273  * The core code interface to the multicast table requires us to give them a
1274  * packed uint8_t array that they manually walk through in ETHERADDRL (6 byte)
1275  * chunks. This must be packed. To deal with this we opt to preserve a normal
1276  * list of multicast addresses and then a secondary version that's serialized as
1277  * the core code wants it. We allocate the memory for this secondary version at
1278  * the start.
1279  */
1280 void
1281 igc_multicast_sync(igc_t *igc)
1282 {
1283 	uint16_t nvalid;
1284 
1285 	ASSERT(MUTEX_HELD(&igc->igc_lock));
1286 
1287 	if (igc->igc_mcast == NULL) {
1288 		igc->igc_nmcast = igc->igc_hw.mac.mta_reg_count;
1289 		igc->igc_mcast = kmem_zalloc(sizeof (igc_addr_t) *
1290 		    igc->igc_nmcast, KM_SLEEP);
1291 		igc->igc_mcast_raw = kmem_alloc(sizeof (ether_addr_t) *
1292 		    igc->igc_nmcast, KM_SLEEP);
1293 	}
1294 
1295 	bzero(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast);
1296 	nvalid = 0;
1297 	for (uint16_t i = 0; i < igc->igc_nmcast; i++) {
1298 		ether_addr_t *targ = &igc->igc_mcast_raw[nvalid];
1299 
1300 		if (!igc->igc_mcast[i].ia_valid)
1301 			continue;
1302 		bcopy(igc->igc_mcast[i].ia_mac, targ, sizeof (ether_addr_t));
1303 		nvalid++;
1304 	}
1305 
1306 	igc_update_mc_addr_list(&igc->igc_hw, (uint8_t *)igc->igc_mcast_raw,
1307 	    nvalid);
1308 }
1309 
1310 /*
1311  * This function is used to reinitialize the PBA, our various flow control
1312  * settings, reset hardware, ensure that the EEE, DPLU, and related power modes
1313  * are in the correct state.
1314  */
1315 bool
1316 igc_hw_common_init(igc_t *igc)
1317 {
1318 	int ret;
1319 	uint32_t pba, hwm, hwmp, hwm2x;
1320 	struct igc_hw *hw = &igc->igc_hw;
1321 
1322 	/*
1323 	 * The PBA register determines which portion is used for the receive
1324 	 * buffers and which is used for the transmit buffers. This follows from
1325 	 * the I210 and reference drivers which use 34K as the default. We
1326 	 * currently leave the RXPBS and TXPBS at their power-on-reset defaults.
1327 	 *
1328 	 * We set the watermark based settings similar to igb, ensuring that we
1329 	 * have 16-byte granularity. The general guidelines from there was that
1330 	 * when it comes to automatic Ethernet PAUSE frame generation we should:
1331 	 *
1332 	 * - After an XOFF, you want to receive at least two frames. We use
1333 	 *   whichever is smaller of 9/10ths and two frames.
1334 	 * - The low water mark apparently wants to be closer to the high water
1335 	 *   mark.
1336 	 *
1337 	 * See igb_init_adapter() for more information. We basically use the
1338 	 * same calculation it did, given that the MAC is basically the same.
1339 	 */
1340 	pba = IGC_PBA_34K;
1341 	hwmp = (pba << 10) * 9 / 10;
1342 	hwm2x = (pba << 10) - 2 * igc->igc_max_frame;
1343 	hwm = MIN(hwmp, hwm2x);
1344 
1345 	hw->fc.high_water = hwm & 0xfffffff0;
1346 	hw->fc.low_water = igc->igc_hw.fc.high_water - 16;
1347 
1348 	/*
1349 	 * Use the suggested default pause time.
1350 	 */
1351 	hw->fc.pause_time = IGC_FC_PAUSE_TIME;
1352 	hw->fc.send_xon = true;
1353 
1354 	if ((ret = igc_reset_hw(hw)) != IGC_SUCCESS) {
1355 		dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
1356 		    ret);
1357 		return (false);
1358 	}
1359 
1360 	if ((ret = igc_init_hw(hw)) != IGC_SUCCESS) {
1361 		dev_err(igc->igc_dip, CE_WARN, "failed to init hardware: %d",
1362 		    ret);
1363 		return (false);
1364 	}
1365 
1366 	/*
1367 	 * Clear wake on LAN and set other power states. In addition, disable
1368 	 * EEE for now.
1369 	 */
1370 	igc_write32(igc, IGC_WUC, 0);
1371 
1372 	if ((ret = igc_set_d0_lplu_state(hw, false)) != IGC_SUCCESS) {
1373 		dev_err(igc->igc_dip, CE_WARN, "failed to set D0 LPLU mode: %d",
1374 		    ret);
1375 		return (false);
1376 	}
1377 
1378 	/*
1379 	 * There have been reports that enabling EEE for some 2.5G devices has
1380 	 * led to issues with the I225/226. It's not entirely clear, but we
1381 	 * default to disabling this like in igb/e1000g for now.
1382 	 */
1383 	if ((ret = igc_set_eee_i225(hw, false, false, false)) != IGC_SUCCESS) {
1384 		dev_err(igc->igc_dip, CE_WARN, "failed to set EEE mode: %d",
1385 		    ret);
1386 		return (false);
1387 	}
1388 
1389 	igc_hw_intr_init(igc);
1390 
1391 	mutex_enter(&igc->igc_lock);
1392 	igc_unicast_sync(igc);
1393 	igc_multicast_sync(igc);
1394 
1395 	igc->igc_hw.mac.get_link_status = true;
1396 	(void) igc_get_phy_info(hw);
1397 	(void) igc_check_for_link(hw);
1398 	mutex_exit(&igc->igc_lock);
1399 
1400 	return (true);
1401 }
1402 
1403 static bool
1404 igc_intr_en(igc_t *igc)
1405 {
1406 	int ret;
1407 
1408 	if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
1409 		ret = ddi_intr_block_enable(igc->igc_intr_handles,
1410 		    igc->igc_nintrs);
1411 		if (ret != DDI_SUCCESS) {
1412 			dev_err(igc->igc_dip, CE_WARN, "failed to block "
1413 			    "enable interrupts: %d", ret);
1414 			return (false);
1415 		}
1416 	} else {
1417 		for (int i = 0; i < igc->igc_nintrs; i++) {
1418 			ret = ddi_intr_enable(igc->igc_intr_handles[i]);
1419 			if (ret != DDI_SUCCESS) {
1420 				dev_err(igc->igc_dip, CE_WARN, "failed to "
1421 				    "enable interrupt %d: %d", i, ret);
1422 				for (int clean = 0; clean < i; clean++) {
1423 					ret = ddi_intr_disable(
1424 					    igc->igc_intr_handles[clean]);
1425 					if (ret != DDI_SUCCESS) {
1426 						dev_err(igc->igc_dip, CE_WARN,
1427 						    "failed to disable "
1428 						    "interrupt %d while "
1429 						    "unwinding: %d", i, ret);
1430 					}
1431 				}
1432 				return (false);
1433 			}
1434 		}
1435 	}
1436 
1437 	/*
1438 	 * Now that we've enabled interrupts here, clear any pending interrupts
1439 	 * and make sure hardware interrupts are enabled.
1440 	 */
1441 	(void) igc_read32(igc, IGC_ICR);
1442 
1443 	return (true);
1444 }
1445 
1446 /*
1447  * Undo interrupt enablement.
1448  */
1449 void
1450 igc_hw_intr_disable(igc_t *igc)
1451 {
1452 	igc_write32(igc, IGC_EIMC, UINT32_MAX);
1453 	igc_write32(igc, IGC_EIAC, 0);
1454 	igc_write32(igc, IGC_IMC, UINT32_MAX);
1455 }
1456 
1457 /*
1458  * This is used during the GLDv3 mc_start(9E) entry point to enable interrupts
1459  * on the device itself.
1460  */
1461 void
1462 igc_hw_intr_enable(igc_t *igc)
1463 {
1464 	uint32_t ims;
1465 
1466 	/*
1467 	 * First we clear pending interrupts.
1468 	 */
1469 	(void) igc_read32(igc, IGC_ICR);
1470 
1471 	/*
1472 	 * The hardware has extended and non-extended interrupt masks and
1473 	 * auto-clear registers. We always disable auto-clear for the
1474 	 * non-extended portions. See the I210 datasheet 'Setting Interrupt
1475 	 * Registers' for a better sense of what's going on here.
1476 	 *
1477 	 * In the IMS register we always register link status change events and
1478 	 * device reset assertions.
1479 	 */
1480 	ims = IGC_IMS_LSC | IGC_IMS_DRSTA;
1481 
1482 	igc_write32(igc, IGC_EIAC, igc->igc_eims);
1483 	igc_write32(igc, IGC_EIMS, igc->igc_eims);
1484 	igc_write32(igc, IGC_IMS, ims);
1485 	igc_write32(igc, IGC_IAM, 0);
1486 }
1487 
1488 static void
1489 igc_cleanup(igc_t *igc)
1490 {
1491 	if (igc->igc_mcast != NULL) {
1492 		ASSERT3U(igc->igc_nmcast, !=, 0);
1493 		kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) *
1494 		    igc->igc_nmcast);
1495 		kmem_free(igc->igc_mcast, sizeof (igc_addr_t) *
1496 		    igc->igc_nmcast);
1497 		igc->igc_nmcast = 0;
1498 		igc->igc_mcast = NULL;
1499 	}
1500 
1501 	if (igc->igc_ucast != NULL) {
1502 		ASSERT3U(igc->igc_nucast, !=, 0);
1503 		kmem_free(igc->igc_ucast, sizeof (igc_addr_t) *
1504 		    igc->igc_nucast);
1505 		igc->igc_nucast = 0;
1506 		igc->igc_ucast = NULL;
1507 	}
1508 
1509 	if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) {
1510 		int ret;
1511 		if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
1512 			ret = ddi_intr_block_disable(igc->igc_intr_handles,
1513 			    igc->igc_nintrs);
1514 			if (ret != DDI_SUCCESS) {
1515 				dev_err(igc->igc_dip, CE_WARN, "failed to "
1516 				    "block disable interrupts: %d", ret);
1517 			}
1518 		} else {
1519 			for (int i = 0; i < igc->igc_nintrs; i++) {
1520 				ret = ddi_intr_disable(
1521 				    igc->igc_intr_handles[i]);
1522 				if (ret != DDI_SUCCESS) {
1523 					dev_err(igc->igc_dip, CE_WARN, "failed "
1524 					    "to disable interrupt %d: %d", i,
1525 					    ret);
1526 				}
1527 			}
1528 		}
1529 		igc->igc_attach &= ~IGC_ATTACH_INTR_EN;
1530 	}
1531 
1532 	if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) {
1533 		int ret = mac_unregister(igc->igc_mac_hdl);
1534 		if (ret != 0) {
1535 			dev_err(igc->igc_dip, CE_WARN, "failed to unregister "
1536 			    "MAC handle: %d", ret);
1537 		}
1538 		igc->igc_attach &= ~IGC_ATTACH_MAC;
1539 	}
1540 
1541 	if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) {
1542 		igc_stats_fini(igc);
1543 		igc->igc_attach &= ~IGC_ATTACH_STATS;
1544 	}
1545 
1546 	if ((igc->igc_attach & IGC_ATTACH_LED) != 0) {
1547 		igc_led_fini(igc);
1548 		igc->igc_attach &= ~IGC_ATTACH_LED;
1549 	}
1550 
1551 	if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) {
1552 		for (int i = 0; i < igc->igc_nintrs; i++) {
1553 			int ret =
1554 			    ddi_intr_remove_handler(igc->igc_intr_handles[i]);
1555 			if (ret != 0) {
1556 				dev_err(igc->igc_dip, CE_WARN, "failed to "
1557 				    "remove interrupt %d handler: %d", i, ret);
1558 			}
1559 		}
1560 		igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER;
1561 	}
1562 
1563 	if (igc->igc_tx_rings != NULL) {
1564 		for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
1565 			igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]);
1566 			mutex_destroy(&igc->igc_tx_rings[i].itr_lock);
1567 		}
1568 		kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) *
1569 		    igc->igc_ntx_rings);
1570 		igc->igc_tx_rings = NULL;
1571 	}
1572 
1573 	if (igc->igc_rx_rings != NULL) {
1574 		for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
1575 			igc_rx_ring_stats_fini(&igc->igc_rx_rings[i]);
1576 			cv_destroy(&igc->igc_rx_rings[i].irr_free_cv);
1577 			mutex_destroy(&igc->igc_rx_rings[i].irr_free_lock);
1578 			mutex_destroy(&igc->igc_rx_rings[i].irr_lock);
1579 		}
1580 		kmem_free(igc->igc_rx_rings, sizeof (igc_rx_ring_t) *
1581 		    igc->igc_nrx_rings);
1582 		igc->igc_rx_rings = NULL;
1583 	}
1584 
1585 	if ((igc->igc_attach & IGC_ATTACH_MUTEX) != 0) {
1586 		mutex_destroy(&igc->igc_lock);
1587 		igc->igc_attach &= ~IGC_ATTACH_MUTEX;
1588 	}
1589 
1590 	if ((igc->igc_attach & IGC_ATTACH_INTR_ALLOC) != 0) {
1591 		for (int i = 0; i < igc->igc_nintrs; i++) {
1592 			int ret = ddi_intr_free(igc->igc_intr_handles[i]);
1593 			if (ret != DDI_SUCCESS) {
1594 				dev_err(igc->igc_dip, CE_WARN, "unexpected "
1595 				    "failure freeing interrupt %d: %d", i, ret);
1596 			}
1597 		}
1598 		igc->igc_attach &= ~IGC_ATTACH_INTR_ALLOC;
1599 	}
1600 
1601 	if (igc->igc_intr_handles != NULL) {
1602 		ASSERT3U(igc->igc_intr_size, !=, 0);
1603 		kmem_free(igc->igc_intr_handles, igc->igc_intr_size);
1604 	}
1605 
1606 	/*
1607 	 * Now that we're almost done, begrudgingly let firmware know we're
1608 	 * done.
1609 	 */
1610 	igc_hw_control(igc, false);
1611 
1612 	if (igc->igc_regs_hdl != NULL) {
1613 		ddi_regs_map_free(&igc->igc_regs_hdl);
1614 		igc->igc_regs_base = NULL;
1615 	}
1616 
1617 	if (igc->igc_cfgspace != NULL) {
1618 		pci_config_teardown(&igc->igc_cfgspace);
1619 	}
1620 	igc->igc_attach &= ~IGC_ATTACH_REGS;
1621 
1622 	ddi_set_driver_private(igc->igc_dip, NULL);
1623 	igc->igc_dip = NULL;
1624 
1625 	VERIFY0(igc->igc_attach);
1626 
1627 	kmem_free(igc, sizeof (igc_t));
1628 }
1629 
1630 static int
1631 igc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1632 {
1633 	igc_t *igc;
1634 
1635 	if (cmd != DDI_ATTACH) {
1636 		return (DDI_FAILURE);
1637 	}
1638 
1639 	igc = kmem_zalloc(sizeof (igc_t), KM_SLEEP);
1640 	ddi_set_driver_private(dip, igc);
1641 	igc->igc_dip = dip;
1642 
1643 	/*
1644 	 * Initialize a few members that are not zero-based.
1645 	 */
1646 	igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
1647 	igc->igc_link_state = LINK_STATE_UNKNOWN;
1648 
1649 	/*
1650 	 * Set up all the register spaces that hardware requires.
1651 	 */
1652 	if (!igc_setup_regs(igc)) {
1653 		goto err;
1654 	}
1655 	igc->igc_attach |= IGC_ATTACH_REGS;
1656 
1657 	/*
1658 	 * Setup the common code.
1659 	 */
1660 	if (!igc_core_code_init(igc)) {
1661 		goto err;
1662 	}
1663 
1664 	if (!igc_limits_init(igc)) {
1665 		goto err;
1666 	}
1667 
1668 	/*
1669 	 * Go allocate and set up all of our interrupts.
1670 	 */
1671 	if (!igc_intr_init(igc)) {
1672 		goto err;
1673 	}
1674 
1675 	/*
1676 	 * Initialize our main mutex for the device now that we have an
1677 	 * interrupt priority.
1678 	 */
1679 	mutex_init(&igc->igc_lock, NULL, MUTEX_DRIVER,
1680 	    DDI_INTR_PRI(igc->igc_intr_pri));
1681 	igc->igc_attach |= IGC_ATTACH_MUTEX;
1682 
1683 	/*
1684 	 * We now want to determine the total number of rx and tx rings that we
1685 	 * have based on our interrupt allocation so we can go through and
1686 	 * perform the rest of the device setup that is required. The various
1687 	 * queues that we have are mapped to a given MSI-X through the IVAR
1688 	 * registers in the device. There is also an IVAR_MISC register that
1689 	 * maps link state change events and other issues up to two vectors.
1690 	 *
1691 	 * There isn't strictly per-queue interrupt generation control. Instead,
1692 	 * when in MSI-X mode, the device has an extended interrupt cause and
1693 	 * mask register. The mask register allows us to mask the five bits
1694 	 * described above.
1695 	 *
1696 	 * Because of all this we end up limiting the number of queues that we
1697 	 * use to 2 for now: 1 for tx and 1 for rx. Interrupt 0 is for tx/other
1698 	 * and 1 for rx.
1699 	 */
1700 	igc->igc_nrx_rings = 1;
1701 	igc->igc_ntx_rings = 1;
1702 
1703 	/*
1704 	 * Default to a 1500 byte MTU.
1705 	 */
1706 	igc->igc_mtu = ETHERMTU;
1707 	igc_hw_buf_update(igc);
1708 
1709 	/*
1710 	 * Initialize default descriptor limits and thresholds. We allocate 1.5
1711 	 * times the number of rx descriptors so that way we can loan up to
1712 	 * 1/3rd of them. We allocate an even number of tx descriptors.
1713 	 */
1714 	igc->igc_rx_ndesc = IGC_DEF_RX_RING_SIZE;
1715 	igc->igc_tx_ndesc = IGC_DEF_TX_RING_SIZE;
1716 	igc->igc_rx_nbuf = igc->igc_rx_ndesc + (igc->igc_rx_ndesc >> 1);
1717 	igc->igc_tx_nbuf = igc->igc_tx_ndesc;
1718 	igc->igc_rx_nfree = igc->igc_rx_nbuf - igc->igc_rx_ndesc;
1719 	igc->igc_rx_intr_nframes = IGC_DEF_RX_RING_INTR_LIMIT;
1720 	igc->igc_rx_bind_thresh = IGC_DEF_RX_BIND;
1721 	igc->igc_tx_bind_thresh = IGC_DEF_TX_BIND;
1722 	igc->igc_tx_notify_thresh = IGC_DEF_TX_NOTIFY_MIN;
1723 	igc->igc_tx_recycle_thresh = IGC_DEF_TX_RECYCLE_MIN;
1724 	igc->igc_tx_gap = IGC_DEF_TX_GAP;
1725 	igc->igc_eitr = IGC_DEF_EITR;
1726 
1727 	if (!igc_rings_alloc(igc)) {
1728 		goto err;
1729 	}
1730 
1731 	if (!igc_intr_hdlr_init(igc)) {
1732 		goto err;
1733 	}
1734 	igc->igc_attach |= IGC_ATTACH_INTR_HANDLER;
1735 
1736 	/*
1737 	 * Next reset the device before we begin initializing anything else. As
1738 	 * part of this, validate the flash checksum if present. This is all
1739 	 * initialization that we would only do once per device. Other
1740 	 * initialization that we want to do after any reset is done is
1741 	 * igc_hw_common_init().
1742 	 */
1743 	if (!igc_hw_init(igc)) {
1744 		goto err;
1745 	}
1746 
1747 	igc_led_init(igc);
1748 	igc->igc_attach |= IGC_ATTACH_LED;
1749 
1750 	/*
1751 	 * Snapshot our basic settings that users can eventually control in the
1752 	 * device. We start with always enabling auto-negotiation and
1753 	 * advertising the basic supported speeds. The I225v1 does have
1754 	 * substantial problems with enabling 2.5G due to the fact that it
1755 	 * doesn't maintain a proper inter-packet gap. Despite that, we default
1756 	 * to enabling 2.5G for now as its supposedly not broken with all link
1757 	 * partners and the NVM. We also don't have a way of actually
1758 	 * identifying and mapping that to something in the driver today,
1759 	 * unfortunately.
1760 	 */
1761 	igc->igc_hw.mac.autoneg = true;
1762 	igc->igc_hw.phy.autoneg_wait_to_complete = false;
1763 	igc->igc_hw.phy.autoneg_advertised = IGC_DEFAULT_ADV;
1764 	igc->igc_hw.fc.requested_mode = igc_fc_default;
1765 	igc->igc_hw.fc.current_mode = igc_fc_default;
1766 
1767 	if (!igc_hw_common_init(igc)) {
1768 		goto err;
1769 	}
1770 
1771 	if (!igc_stats_init(igc)) {
1772 		goto err;
1773 	}
1774 	igc->igc_attach |= IGC_ATTACH_STATS;
1775 
1776 	/*
1777 	 * Register with MAC
1778 	 */
1779 	if (!igc_mac_register(igc)) {
1780 		goto err;
1781 	}
1782 	igc->igc_attach |= IGC_ATTACH_MAC;
1783 
1784 	/*
1785 	 * Enable interrupts and get going.
1786 	 */
1787 	if (!igc_intr_en(igc)) {
1788 		goto err;
1789 	}
1790 	igc->igc_attach |= IGC_ATTACH_INTR_EN;
1791 
1792 	return (DDI_SUCCESS);
1793 
1794 err:
1795 	igc_cleanup(igc);
1796 	return (DDI_FAILURE);
1797 }
1798 
1799 static int
1800 igc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1801 {
1802 	igc_t *igc;
1803 
1804 	if (cmd != DDI_DETACH) {
1805 		return (DDI_FAILURE);
1806 	}
1807 
1808 	igc = ddi_get_driver_private(dip);
1809 	if (igc == NULL) {
1810 		dev_err(dip, CE_WARN, "asked to detach, but missing igc_t");
1811 		return (DDI_FAILURE);
1812 	}
1813 
1814 	igc_cleanup(igc);
1815 	return (DDI_SUCCESS);
1816 }
1817 
1818 static struct cb_ops igc_cb_ops = {
1819 	.cb_open = nulldev,
1820 	.cb_close = nulldev,
1821 	.cb_strategy = nodev,
1822 	.cb_print = nodev,
1823 	.cb_dump = nodev,
1824 	.cb_read = nodev,
1825 	.cb_write = nodev,
1826 	.cb_ioctl = nodev,
1827 	.cb_devmap = nodev,
1828 	.cb_mmap = nodev,
1829 	.cb_segmap = nodev,
1830 	.cb_chpoll = nochpoll,
1831 	.cb_prop_op = ddi_prop_op,
1832 	.cb_flag = D_MP,
1833 	.cb_rev = CB_REV,
1834 	.cb_aread = nodev,
1835 	.cb_awrite = nodev
1836 };
1837 
1838 static struct dev_ops igc_dev_ops = {
1839 	.devo_rev = DEVO_REV,
1840 	.devo_refcnt = 0,
1841 	.devo_getinfo = NULL,
1842 	.devo_identify = nulldev,
1843 	.devo_probe = nulldev,
1844 	.devo_attach = igc_attach,
1845 	.devo_detach = igc_detach,
1846 	.devo_reset = nodev,
1847 	.devo_quiesce = ddi_quiesce_not_supported,
1848 	.devo_cb_ops = &igc_cb_ops
1849 };
1850 
1851 static struct modldrv igc_modldrv = {
1852 	.drv_modops = &mod_driverops,
1853 	.drv_linkinfo = "Intel I226/226 Ethernet Controller",
1854 	.drv_dev_ops = &igc_dev_ops
1855 };
1856 
1857 static struct modlinkage igc_modlinkage = {
1858 	.ml_rev = MODREV_1,
1859 	.ml_linkage = { &igc_modldrv, NULL }
1860 };
1861 
1862 int
1863 _init(void)
1864 {
1865 	int ret;
1866 
1867 	mac_init_ops(&igc_dev_ops, IGC_MOD_NAME);
1868 
1869 	if ((ret = mod_install(&igc_modlinkage)) != 0) {
1870 		mac_fini_ops(&igc_dev_ops);
1871 	}
1872 
1873 	return (ret);
1874 }
1875 
1876 int
1877 _info(struct modinfo *modinfop)
1878 {
1879 	return (mod_info(&igc_modlinkage, modinfop));
1880 }
1881 
1882 int
1883 _fini(void)
1884 {
1885 	int ret;
1886 
1887 	if ((ret = mod_remove(&igc_modlinkage)) == 0) {
1888 		mac_fini_ops(&igc_dev_ops);
1889 	}
1890 
1891 	return (ret);
1892 }
1893