/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Intel I225/226 Ethernet Driver
 * ------------------------------
 *
 * This driver implements support for the Intel I225 and I226 Ethernet
 * controllers, which support up to 2.5 GbE and generally only support BASE-T
 * copper PHYs. This device is yet another variant on the venerable Intel 1 GbE
 * devices that are found in e1000g(4D) and igb(4D). This is its own driver in
 * part because that's how Intel did things and refactored their common code,
 * which we import and which is found in the 'core' directory.
 *
 * There is not a good datasheet for the MAC that we've been able to find for
 * this part. It's not clear that Intel even has a doc for this in their
 * Resource and Design Center. The primary datasheet documents the NVM and
 * other parts of it, but not the software interface. Based on observations
 * from the common code we describe this as somewhat of an evolution of the
 * I217 and I210, with fewer features than the I210, which comes from the
 * server world (and which was itself a more stripped down I350).
 *
 * The result of all this is us trying to focus on what we know about this part
 * and making some assumptions along the way. This includes things like:
 *
 * 1) We believe that the device only supports up to 4 RX and TX queues.
 * 2) There is only one TX context for each TX queue and it is mapped to the
 *    queue.
 * 3) There is no support for the head writeback modes that we've found.
 * 4) This does otherwise support both the MSI-X and MSI/INTx interrupt
 *    management, which are shaped very differently in the device.
 * 5) The 2500BASE-T PHY support is unique, but the other PHY settings are
 *    roughly the same as far as we can tell.
 *
 * There are certainly more differences than the points up above, but the
 * above are ones that generally influence our design.
 *
 * ------------
 * Organization
 * ------------
 *
 * This driver is first broken into two different pieces. There is the 'core'
 * code which we import from Intel via FreeBSD. All of these sources are in
 * the 'uts/common/io/igc/core' directory and we try our hardest to avoid
 * modifying them (hence the smatch gags). The core code can be thought of as
 * abstracting the MAC, NVM, and PHY across different chipsets (right now it's
 * all the I225) and providing us with a series of library calls that we can
 * use to manage the chip.
 *
 * The remaining files that sit alongside this one implement different
 * portions of functionality related to the device. In particular:
 *
 *   igc.[ch]:          This is the main entry point for the driver and the
 *                      source of this block comment. It implements all of the
 *                      basic DDI entry points: attach and detach, interrupts,
 *                      PCI config space and register set up and tear down.
 *
 *                      The header file contains all of the structure
 *                      definitions that we use throughout this and the basic
 *                      constants we use for sizing.
 *
 *   igc_gld.c          This file implements all of the major GLDv3 required
 *                      entry points that are found in mac(9E). The guts of
 *                      the actual I/O are in igc_ring.c, but getting and
 *                      setting all of the various MAC properties and other
 *                      bits is here.
 *
 *   igc_osdep.[ch]     The osdep (OS dependent) files are used to define and
 *                      implement the functionality required by the common
 *                      code. igc_osdep.h is included in the build of each
 *                      file.
 *
 *                      We have a second use for igc_osdep.h, which is where
 *                      we put missing hardware definitions that apply. This
 *                      is for cases where the core code doesn't have it and
 *                      it should really live in igc_defines.h or igc_regs.h,
 *                      but we keep it here to avoid modifying those.
 *
 *   igc_ring.c         This implements the core I/O routines of the device
 *                      driver, starting with the descriptor ring setup and
 *                      tear down as well as DMA, descriptor ring, and
 *                      per-frame memory. It also implements all of the
 *                      primary logic for transmitting and receiving frames.
 *
 *   igc_stat.c         This file deals with kstat creation and destruction
 *                      as well as reading and fetching all of the statistics
 *                      registers that exist in hardware.
 *
 * There are a few primary data structures to be aware of. Their relationships
 * are shown in the following image and then described. Note, each structure
 * has many more fields than those pictured:
 *
104 * +---------------+
105 * | dev_info_t * |
106 * | -|-+
107 * | private data | |
108 * +---------------+ v
109 * +------------------------------+ +---------------------+
110 * | igc_t | | igc_addr_t |
111 * | per-instance primary | +---->| |
112 * | structure | |+--->| Notes a MAC address | ...
113 * | | || | stored in hardware |
114 * | igc_addr_t *igc_ucast -|--+| +---------------------+
115 * | igc_addr_t *igc_mcast -|---+ +---------------------------+
116 * | struct igc_hw *igc_hw -|--------->| struct igc_hw (core code) |
117 * | igc_tx_ring_t *igc_tx_rings -|--+ | |
118 * | igc_rx_ring_t *igc_rx_rings -|--|---+ | igc_mac_info mac |
119 * +------------------------------+ | | | igc_fc_info fc |
120 * | | | igc_phy_info phy |
121 * +----------------------------------+ | | igc_nvm_info nvm |
122 * | v +---------------------------+
123 * | +--------------------------------------+
124 * | | igc_rx_ring_t |
125 * | | |
126 * | | igc_adv_rx_desc *irr_ring ---|--> rx hw descriptor ring
127 * | | uint32_t irr_next ---|--> next entry to look for data
128 * | | igc_rx_buffer_t **irr_work_list ---|--> corresponds to ring entries
129 * | | uint32_t irr_nfree ---|--> number of free list entries
130 * | | igc_rx_buffer_t **irr_free_list ---|--> set of buffers free for bind
131 * | | igc_rx_buffer_t *irr_arena ---|-+> array of all rx buffers
132 * | +--------------------------------------+ |
133 * | |
134 * | +----------------------------+ |
135 * | | igc_rx_buffer_t |<--+
136 * | | |
137 * | | mblk_t *igb_mp -|---> mblk_t for rx buffer
138 * | | igc_dma_buffer_t irb_dma -|---> DMA memory for rx buffer
139 * | +----------------------------+
140 * |
141 * | +------------------------------------+
142 * +-->| igc_tx_ring_t |
143 * | |
144 * | icc_adv_tx_desc *itr_ring --|--> tx hw descriptor ring
145 * | uin32_t itr_ring_head --|--> next descriptor to recycle
146 * | uin32_t itr_ring_fail --|--> next descriptor to place
147 * | uin32_t itr_ring_free --|--> free descriptors in ring
148 * | igc_tx_buffer_t **itr_work_list |--> corresponds to ring entries
149 * | list_t itr_free_list --|--> available tx buffers
150 * | igc_tx_buffer_t *itr_arena --|-+> array of all tx buffers
151 * +------------------------------------+ |
152 * |
153 * +---------------------------------+ |
154 * | igc_tx_buffer_t |<-+
155 * | |
156 * | mblk_t *itb_mp --|--> mblk to tx (only in first)
157 * | igc_dma_buffer_t itb_dma --|--> tx DMA buffer for copy
158 * | ddi_dma_handle_t itb_bind_hdl --|--> DMA handle for bind
159 * +---------------------------------+
160 *
 * igc_t              This is the primary data structure that exists for each
 *                    instance of the driver. There is generally a 1:1
 *                    relationship between a physical port, an instance of the
 *                    driver, and a PCI function. This structure provides
 *                    access to the device's registers and it embeds the
 *                    common code's struct igc_hw.
 *
 * struct igc_hw      This structure is used by the core code and it contains
 *                    information related to the MAC, PHY, NVM, and related
 *                    information that the device uses. In general, this
 *                    structure is used when performing API calls to the
 *                    common code. The common code calls back into us in the
 *                    igc_osdep.c interfaces.
 *
 * igc_tx_ring_t      This structure represents a single transmit ring in
 *                    hardware, its associated software state, and
 *                    miscellaneous data like statistics, MAC handles, etc.
 *                    See the 'TX Data Path Design' section for more
 *                    information.
 *
 * igc_rx_ring_t      This is the receive variant of a ring. It represents and
 *                    tracks the hardware state along with all our metadata.
 *                    One of these exists for each receive ring that we've
 *                    enabled (currently one). See the 'RX Data Path Design'
 *                    section for more information.
 *
 * igc_tx_buffer_t    This represents a single tx buffer in the driver. A tx
 *                    buffer contains DMA based storage that it can use to
 *                    transmit a packet and contains a second DMA handle that
 *                    can be used to bind a specific mblk_t to it. tx buffers
 *                    are capped at the current page size and can be smaller
 *                    if the maximum packet size is smaller. A 1500 byte MTU
 *                    will end up with a 2 KiB buffer due to the device's
 *                    internal alignment requirements.
 *
 * igc_rx_buffer_t    This represents a single rx buffer in the driver. These
 *                    buffers may be loaned up to MAC and then returned to us
 *                    later. They contain a single DMA buffer which right now
 *                    is a single contiguous buffer that fits the maximum
 *                    packet size. Each buffer has a corresponding mblk_t that
 *                    it is mapped to.
 *
 * igc_dma_buffer_t   This represents a DMA buffer in the system. DMA buffers
 *                    are used for transmit buffers, receive buffers, or
 *                    various ring descriptor entries. The DMA buffer
 *                    structure is not inherently limited to a specific number
 *                    of cookies. It is always mapped in our virtual address
 *                    space and encapsulates the various DDI functions. In
 *                    general, one expects to interface with the idb_va member
 *                    when needing to access the memory, the idb_size member
 *                    when wanting to understand how much memory is in the
 *                    buffer, and the idb_hdl member when needing to access
 *                    the DMA cookies.
 *
 * igc_addr_t         This represents a 48-bit Ethernet MAC address slot in
 *                    the hardware that may or may not be used at any given
 *                    point in time.
 *
 * --------------------
 * Rings and Interrupts
 * --------------------
 *
 * The I225/226 controller, like the I210, supports up to 4 rx and tx rings.
 * Due to the long history of this controller and its lineage from the
 * e1000g/igb days and much older parts like the 8254x series, it has two
 * entirely different sets of interrupt modes: one where MSI-X is used and a
 * mode where a single MSI or INTx interrupt is used. Currently the driver
 * only supports the MSI-X mode, as that gives us more flexibility and,
 * because the interrupt modes and register handling are different, reduces
 * the complexity in the driver.
 *
 * The hardware uses its IVAR registers to map specific queues to interrupts.
 * Each rx queue and tx queue is mapped to a specific bit position in the IVAR
 * and there is an additional IVAR register for miscellaneous causes like link
 * state changes. While the IVAR register allows for several bits for MSI-X
 * entries, for the most part, it appears that there is only support for
 * values in the range [0, 4] based on the I210, which we believe extends to
 * the I225/226.
 *
 * MSI-X mode causes the device's various interrupt registers to be split into
 * two groups: the 'legacy' and the 'extended' (sometimes called advanced)
 * ones. The extended ones all start with 'E'. When in MSI-X mode, the EICR
 * (cause), EICS (cause set), EIAC (auto-clear), and EIMS (mask set) registers
 * all operate with indexes that refer to the MSI-X vector. The primary way to
 * temporarily disable interrupts for polling is to remove the given MSI-X
 * from the auto-clear set and to clear it from the enabled mask set.
 *
 * The implication of all of this is that we can only really disable
 * interrupts for polling on a per-MSI-X basis. This generally means that the
 * design for interrupts and rings is that all the tx rings and the link state
 * change events share interrupt 0, while rx rings use interrupts 1-4. Because
 * the x86 'apix' module ends up defaulting to two interrupts per driver, we
 * end up only supporting a single rx and tx ring for the time being, though
 * the driver is phrased in terms of a variable number of such rings.
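 *
 * As a concrete illustration of that hand-off, temporarily disabling a ring's
 * interrupt for polling amounts to roughly the following sketch. The
 * igc_rx_ring_poll_enter() name and the locking details are illustrative
 * assumptions for this comment, not an actual entry point in the driver:
 *
 *        void
 *        igc_rx_ring_poll_enter(igc_t *igc, igc_rx_ring_t *ring)
 *        {
 *                const uint32_t mask = 1 << ring->irr_intr_idx;
 *
 *                ring->irr_flags |= IGC_RXR_F_POLL;
 *                igc_write32(igc, IGC_EIMC, mask);       <- mask the vector
 *                igc_write32(igc, IGC_EIAC,              <- and stop
 *                    igc->igc_eims & ~mask);                auto-clearing it
 *        }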
 *
 * -------------------
 * RX Data Path Design
 * -------------------
 *
 * The rx data path is based around allocating a fixed number of receive
 * buffers for each ring. We have two goals in the buffer allocation and ring
 * design:
 *
 * 1) We want to make sure that the ring is always full of valid descriptors
 *    for rx to prevent stalls. One implication of this is that we will always
 *    refill a received buffer with a new one and notify the hardware that the
 *    buffer is usable again.
 *
 * 2) We would prefer to not have to copy received memory and instead bind the
 *    DMA memory directly into an mblk_t.
 *
 * To satisfy (1) we need to allocate at least as many rx buffers as there are
 * ring entries. The ring is sized by default to 512 entries, which is a
 * somewhat arbitrary, but common, size. We then say that we want to be able
 * to loan half of our entries up the stack at any given time. This leads to
 * us allocating 1.5x the ring size in rx buffers.
 *
 * All of the rx buffers are stored in the irr_arena array. They are then
 * split between the free list and the ring's work list. The work list is an
 * array that is a 1:1 mapping to a location in the descriptor ring. That is,
 * index 4 of the work list (irr_work_list[4]) corresponds to index 4 of the
 * descriptor ring (irr_ring[4]). However, this may refer to any of the rx
 * buffers that are in the irr_arena. When we start up the ring, the first
 * ring size entries are all inserted into the work list and then the
 * remaining entries are inserted into the free list.
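 *
 * A sketch of how the two lists are seeded at ring start; 'rs' (the ring
 * size) and 'nbuf' (1.5 * rs, the total buffer count) are shorthand for this
 * comment, not actual variable names in the driver:
 *
 *        for (uint32_t i = 0; i < rs; i++)
 *                ring->irr_work_list[i] = &ring->irr_arena[i];  <- given to HW
 *        for (uint32_t i = rs; i < nbuf; i++)
 *                ring->irr_free_list[ring->irr_nfree++] =
 *                    &ring->irr_arena[i];                       <- idle spares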
 *
 * Entries that are in the work list are always given to hardware. We track
 * the next place for us to scan for received packets through the 'irr_next'
 * index into the descriptor ring. When an interrupt fires, we start at
 * irr_next and iterate through the descriptor ring, continuing while we find
 * valid, received packets. When we process a packet, we look at two things to
 * consider whether we bind it or copy it to a new mblk_t. The first piece is
 * the received packet's length. If the packet is small, there is not much
 * value in binding it and instead we just allocate and copy a new buffer for
 * the packet.
 *
 * The second is if there are free rx descriptors. To keep goal (1) valid, we
 * will only loan a packet up if there is an entry on the free list that can
 * replace the rx buffer, as otherwise we'd risk stalling the ring. If an rx
 * buffer is loaned, the entry on the free list takes its place in the
 * descriptor ring and when the networking stack is finally done with the
 * mblk_t, it'll be returned to us as part of the freemsg()/freeb()
 * destructor. This lifetime is illustrated in the following diagram:
 *
 *
 *   +-------------+                        +-----------+
 *   | Work List   |<---*-------------------| Free List |
 *   | Owned by HW |    . . Used to replace | Idle      |
 *   +-------------+        loaned buffers  +-----------+
 *     |  |  ^                                    ^
 *     |  |  . . . Reused if a                    |
 *     |  +-+      copy is done  . . . Returned to driver via
 *     |                         |     freemsg() which calls
 *     |                         |     igc_rx_recycle().
 *     v                         |
 *   +-------------------+       |
 *   | Loaned            |-------+
 *   | Owned by netstack |
 *   +-------------------+
 *
 * Currently the rx data path uses rx buffers that are equal to the maximum
 * size of a packet (rounded up based on the hardware's 1 KiB alignment
 * requirement). This was mostly done for initial simplicity, though it comes
 * at a memory cost. It is possible to design this to be more like the tx
 * subsystem, where we use fixed page size buffers and just cons up an mblk_t
 * chain with b_cont pointers.
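 *
 * Putting the above together, the heart of the interrupt-driven rx scan looks
 * roughly like the following; igc_rx_copy_thresh, igc_rx_bind(), and
 * igc_rx_copy() are illustrative stand-ins here rather than the actual
 * tunable and helper names:
 *
 *        uint32_t idx = ring->irr_next;
 *        while (irr_ring[idx] has its done bit set) {
 *                len = length from irr_ring[idx];
 *                if (len >= igc_rx_copy_thresh && ring->irr_nfree > 0)
 *                        mp = igc_rx_bind(ring, idx, len);  <- swap in free buf
 *                else
 *                        mp = igc_rx_copy(ring, idx, len);  <- allocb + bcopy
 *                append mp to the chain destined for mac_rx_ring();
 *                idx = (idx + 1) % ring size;
 *        }
 *        ring->irr_next = idx;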
 *
 * -------------------
 * TX Data Path Design
 * -------------------
 *
 * The tx data path is a bit different in design from the rx data path. When
 * the system wants to tx data there are two fundamental building blocks that
 * we use, both of which leverage the igc_tx_buffer_t:
 *
 * 1) We use the DMA memory that is allocated with the buffer and copy the
 *    mblk_t data into it. This is used when we have small mblk_t's.
 *
 * 2) We utilize the DMA handle that is in the tx buffer (but not the buffer's
 *    DMA memory) to perform DMA binding. This can result in multiple cookies
 *    and therefore descriptors mapping to the single buffer.
 *
 * Because a given tx buffer may end up using more than one descriptor and we
 * have to account for transmit context descriptors, which are used for
 * indicating checksum and segmentation offloads, we end up only allocating a
 * number of transmit buffers equal to the ring size. In addition, the tx data
 * buffer's maximum size is capped at the size of a single page. This is done
 * because we often aren't going to be copying and if we are, we don't need
 * that much more memory. The actual size may be smaller depending on the MTU.
 *
 * The tx descriptor ring is used in a bit of a different way. While part of
 * the reason for this is that we are filling it based on the stack's demands
 * and therefore only need to fill in descriptors when there's a need, the
 * second reason is because of how the hardware reports back events. There are
 * two major kinds of descriptors that can be entered into the ring: the
 * aforementioned context descriptors and then data descriptors. While data
 * descriptors support an interrupt on completion, context descriptors do not.
 *
 * When an mblk_t comes in to be transmitted, we walk all of the mblk_t's
 * associated with it via the b_cont pointer. For each one, we look at the
 * size of the data and determine whether or not to perform DMA binding or to
 * copy it into the current tx buffer. A given tx buffer can be used to copy
 * multiple different mblk_t's. Imagine a pathological case where we had a 500
 * byte packet split into 125 byte chunks: this would end up using a single tx
 * data buffer. However, if you imagine a large chunk of TCP data, it may be
 * spread across several mblk_t's, so we may end up leveraging multiple tx
 * data buffers.
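 *
 * In rough pseudo-code, the per-mblk_t decision above looks like the
 * following; igc_tx_bind_thresh is an illustrative stand-in for whatever the
 * actual tunable is named:
 *
 *        for (mblk_t *mp = pkt; mp != NULL; mp = mp->b_cont) {
 *                if (MBLKL(mp) > igc_tx_bind_thresh &&
 *                    DMA binding via itb_bind_hdl succeeds)
 *                        emit one data descriptor per DMA cookie;
 *                else
 *                        bcopy into the current tx buffer's itb_dma memory,
 *                        grabbing a new tx buffer when it fills;
 *        }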
 *
 * The transmit buffers that are available are stored on a free list. This is
 * managed as a list_t, as we often end up needing to track groups of buffers
 * to allocate and free across packet transmit and recycling. We don't count
 * the number of transmit buffers that are free per se; instead we track the
 * number of free descriptors, which tracks the buffers well enough. In the
 * worst case there is a 1:1 relationship between buffers and descriptors, and
 * more generally it's 1:n, that is, there are multiple descriptors used for a
 * single buffer.
 *
 * The transmit ring is managed through a combination of three integers: the
 * itr_ring_head, the itr_ring_tail, and the itr_ring_free. The ring's tail
 * represents the place where the driver will place new data to transmit. The
 * ring's head represents the first place that we should check for a packet's
 * completion when we're performing recycling (the act of acknowledging what
 * hardware has processed internal to the driver) due to a tx interrupt or
 * manual recycling in the transmit path.
 *
 * When placing a packet into the descriptor ring we'll end up doing the
 * following (a compressed sketch follows the list):
 *
 * 1) First we determine how to map each mblk_t as mentioned above.
 * 2) This will then be turned into descriptors in the ring. Each tx data
 *    buffer that is used is placed in the itr_work_list at the corresponding
 *    index that it is used in the ring. There is one special case here: if a
 *    context descriptor is used, the first transmit buffer will refer to the
 *    context descriptor's entry (which always comes before data).
 * 3) We'll ensure that there are enough descriptors for this packet to fit
 *    into the ring and that it would not exceed our mandatory gap threshold.
 *    If not, then we'll undo all the work we just did, return the mblk_t to
 *    MAC, and indicate that the ring is blocked. MAC will be notified later
 *    when we free up transmit descriptors.
 * 4) In the first transmit data buffer we'll store both the mblk_t and the
 *    index of the last descriptor that's used. This is important for
 *    recycling. We also indicate that the last descriptor should be the one
 *    that reports its status on interrupt completion.
 * 5) We'll notify hardware that there is data for it to transmit by writing
 *    to the ring's tail pointer.
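 *
 * A compressed sketch of the steps above, with error and locking details
 * elided:
 *
 *        map each mblk_t to copy or bind segments;                     (1)
 *        fill itr_ring and itr_work_list entries, leading with a      (2)
 *            context descriptor if offloads require one;
 *        if (the packet doesn't fit within the mandatory gap)         (3)
 *                undo the above, mark the ring blocked, return to MAC;
 *        in the first tx buffer, record itb_mp and itb_last_desc      (4)
 *            and set report-status on the last data descriptor;
 *        igc_write32(igc, ring tail register, new tail);              (5)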
 *
 * This all works reasonably okay, except for the small problem of the bill,
 * which we pay off in the form of recycling. Recycling is going through the
 * ring and seeing which descriptors are free. While the transmit path
 * described above is the only path that is allowed to move the tail, the
 * recycling path is the only one that's allowed to adjust the head.
 *
 * When we perform recycling we look at the current head and its corresponding
 * tx buffer. There will always be a tx buffer in the same index in the
 * itr_work_list[] unless a serious programmer error has occurred. This buffer
 * will tell us what the index to check for completion is via its
 * itb_last_desc member (only valid when itb_first is set to true). If this
 * index indicates that it has been processed by hardware, then we process all
 * entries between here and there.
 *
 * When we process descriptors, we bunch up the transmit descriptors and
 * mblk_t's. We'll reset the transmit descriptor (freeing any DMA binding if
 * used) and append the mblk_t if it exists to be freed in one large
 * freemsgchain() at the end. The fact that we won't free any tx buffers
 * associated with a packet until they're all done is important. This makes
 * sure that any memory that we have bound from the mblk_t remains valid the
 * entire time.
 *
 * If we have freed enough descriptors as part of this to allow MAC to send
 * data again, then once we have finished all processing and dropped the lock,
 * we will notify MAC.
 *
 * When we are processing descriptors here we try to avoid holding the
 * itr_lock except for the start and end of the process. This is an important
 * way to ensure that we don't block transmits. Because of this, there can
 * only be one thread performing a recycle at any given time between the
 * interrupt path and the transmit path trying to clean up. This is maintained
 * using the 'itr_recycle' boolean. If a recycle is already in progress then
 * there's generally not much reason to perform one simultaneously and so the
 * caller will just return. This is why the head (and thus returning
 * descriptors) is only used by the recycle path.
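 *
 * Condensed into pseudo-code, with the mid-walk lock drop elided, recycling
 * looks roughly like this (mac_tx_ring_update() being the standard mac(9E)
 * notification routine):
 *
 *        mutex_enter(&ring->itr_lock);
 *        if (ring->itr_recycle) {
 *                mutex_exit(&ring->itr_lock);   <- someone else is recycling
 *                return;
 *        }
 *        ring->itr_recycle = true;
 *        walk from itr_ring_head while each first buffer's itb_last_desc
 *            shows hardware is done, resetting buffers and chaining any
 *            itb_mp onto a local mblk_t chain;
 *        update itr_ring_head and itr_ring_free;
 *        ring->itr_recycle = false;
 *        mutex_exit(&ring->itr_lock);
 *        freemsgchain(chain);
 *        if (enough descriptors were freed)
 *                notify MAC (e.g. mac_tx_ring_update());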
 *
 * -------
 * Locking
 * -------
 *
 * Mutexes exist on three different structures in the driver:
 *
 * 1) igc_t (igc_lock)
 * 2) igc_rx_ring_t (irr_lock, irr_free_lock)
 * 3) igc_tx_ring_t (itr_lock)
 *
 * The following rules hold for locking in the driver:
 *
 * 1) One should not hold locks for both the rx rings and tx rings at the
 *    same time. If this is required, please determine if it is absolutely
 *    necessary.
 * 2) You should always take the controller's lock ahead of any ring's locks.
 * 3) The general rx ring lock (irr_lock) should be taken ahead of the free
 *    list lock (irr_free_lock) if both are required.
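 *
 * For example, a path that needed the controller lock and both rx ring locks
 * would, per rules (2) and (3), acquire and release them as:
 *
 *        mutex_enter(&igc->igc_lock);
 *        mutex_enter(&ring->irr_lock);
 *        mutex_enter(&ring->irr_free_lock);
 *        ...
 *        mutex_exit(&ring->irr_free_lock);
 *        mutex_exit(&ring->irr_lock);
 *        mutex_exit(&igc->igc_lock);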
 *
 * -------------------
 * Future Improvements
 * -------------------
 *
 * This driver was initially written with an eye towards getting something
 * that had broad use for folks with this hardware and not towards enabling
 * every feature immediately. Here are some areas that can be improved upon in
 * the driver:
 *
 *  - Multiple ring, RSS support: As the OS changes towards offering more
 *    interrupts or opting to participate in IRM, we can more easily offer RSS
 *    and related features. This should likely show up as a single rx group
 *    with multiple rings and leverage the tx pseudo-group support.
 *
 *  - TCP segmentation offload support: Right now the driver does not support
 *    TSO. It'd potentially be a useful addition and help out folks. The tx
 *    data path already fetches the information that TSO would need.
 *
 *  - FMA Support: Currently the driver does not rig up support for FMA.
 *    Participating in that and more generally being able to reset the device
 *    while it is operating in the face of fatal errors would be good.
 *
 *  - TX stall detection: Related to the above, carefully designing tx stall
 *    detection and resetting the device when that happens would probably be
 *    useful.
 *
 *  - UFM support: Exposing the NVM and PBA (printed board assembly) through
 *    the UFM subsystem would be a good thing to do.
 *
 *  - Dynamic MTU changing: Right now the driver takes advantage of the
 *    simplification of not allowing the MTU to change once the device has
 *    been started. This isn't great, but it is far from the first (igb,
 *    e1000g, ixgbe, etc.) to do this. It would be nice if this was lifted.
 */

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/pci.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/bitext.h>

#include "igc.h"

/*
 * The core code expects the igc_mcast_raw to be a uint8_t packed array. We use
 * the ether_addr_t to make this a little more explicit and easy to reason
 * about, but that means we are relying on this size.
 */
CTASSERT(sizeof (ether_addr_t) == 6);

uint32_t
igc_read32(igc_t *igc, uint32_t reg)
{
        uint32_t *addr;
        ASSERT3U(reg, <, igc->igc_regs_size);
        addr = (uint32_t *)(igc->igc_regs_base + reg);
        return (ddi_get32(igc->igc_regs_hdl, addr));
}

void
igc_write32(igc_t *igc, uint32_t reg, uint32_t val)
{
        uint32_t *addr;
        ASSERT3U(reg, <, igc->igc_regs_size);
        addr = (uint32_t *)(igc->igc_regs_base + reg);
        ddi_put32(igc->igc_regs_hdl, addr, val);
}

/*
 * Ask hardware if the link is up and ready. Note, this assumes that we're on a
 * copper phy and short circuits a few things. See igb_is_link_up() for what
 * this looks like for non-copper PHYs if that ever becomes relevant.
 */
static bool
igc_link_up(igc_t *igc)
{
        ASSERT(MUTEX_HELD(&igc->igc_lock));

        /*
         * When the link is up, then the core code will clear the value below.
         * Otherwise we likely need to assume it's down.
         */
        (void) igc_check_for_link(&igc->igc_hw);
        return (!igc->igc_hw.mac.get_link_status);
}

static void
igc_intr_lsc(igc_t *igc)
{
        link_state_t orig_state, new_state;
        uint32_t mmd_base;

        mutex_enter(&igc->igc_lock);
        orig_state = igc->igc_link_state;

        /*
         * Always force a check of the link.
         */
        igc->igc_hw.mac.get_link_status = true;
        if (igc_link_up(igc)) {
                uint16_t duplex = 0;

                (void) igc_get_speed_and_duplex(&igc->igc_hw,
                    &igc->igc_link_speed, &duplex);

                switch (duplex) {
                case HALF_DUPLEX:
                        igc->igc_link_duplex = LINK_DUPLEX_HALF;
                        break;
                case FULL_DUPLEX:
                        igc->igc_link_duplex = LINK_DUPLEX_FULL;
                        break;
                default:
                        igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
                        break;
                }
                igc->igc_link_state = LINK_STATE_UP;
        } else {
                igc->igc_link_state = LINK_STATE_DOWN;
                igc->igc_link_speed = 0;
                igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
        }
        new_state = igc->igc_link_state;

        /*
         * Next, grab a bunch of information from the PHY for future us.
         */
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_CONTROL, &igc->igc_phy_ctrl);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_STATUS,
            &igc->igc_phy_status);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_ADV,
            &igc->igc_phy_an_adv);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_LP_ABILITY,
            &igc->igc_phy_lp);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_AUTONEG_EXP,
            &igc->igc_phy_an_exp);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_CTRL,
            &igc->igc_phy_1000t_ctrl);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_1000T_STATUS,
            &igc->igc_phy_1000t_status);
        (void) igc_read_phy_reg(&igc->igc_hw, PHY_EXT_STATUS,
            &igc->igc_phy_ext_status);

        mmd_base = STANDARD_AN_REG_MASK << MMD_DEVADDR_SHIFT;
        (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_CTRL,
            &igc->igc_phy_mmd_ctrl);
        (void) igc_read_phy_reg(&igc->igc_hw, mmd_base | ANEG_MULTIGBT_AN_STS1,
            &igc->igc_phy_mmd_sts);
        mutex_exit(&igc->igc_lock);

        if (orig_state != new_state) {
                mac_link_update(igc->igc_mac_hdl, new_state);
        }
}

static uint_t
igc_intr_rx_queue(caddr_t arg1, caddr_t arg2)
{
        igc_t *igc = (igc_t *)arg1;
        uintptr_t queue = (uintptr_t)arg2;
        igc_rx_ring_t *ring;
        mblk_t *mp = NULL;

        ASSERT3U(queue, <, igc->igc_nrx_rings);
        ring = &igc->igc_rx_rings[queue];

        mutex_enter(&ring->irr_lock);
        if ((ring->irr_flags & IGC_RXR_F_POLL) == 0) {
                mp = igc_ring_rx(ring, IGC_RX_POLL_INTR);
        }
        mutex_exit(&ring->irr_lock);

        if (mp != NULL) {
                mac_rx_ring(igc->igc_mac_hdl, ring->irr_rh, mp, ring->irr_gen);
        }

        return (DDI_INTR_CLAIMED);
}

static uint_t
igc_intr_tx_other(caddr_t arg1, caddr_t arg2)
{
        igc_t *igc = (igc_t *)arg1;
        uint32_t icr = igc_read32(igc, IGC_ICR);

        igc_tx_recycle(igc, &igc->igc_tx_rings[0]);

        if ((icr & IGC_ICR_LSC) != 0) {
                igc_intr_lsc(igc);
        }

        return (DDI_INTR_CLAIMED);
}

static bool
igc_setup_regs(igc_t *igc)
{
        int ret;
        ddi_device_acc_attr_t da;

        if (pci_config_setup(igc->igc_dip, &igc->igc_cfgspace) !=
            DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to map config space");
                return (false);
        }

        if (ddi_dev_regsize(igc->igc_dip, IGC_PCI_BAR, &igc->igc_regs_size) !=
            DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get BAR %u size",
                    IGC_PCI_BAR - 1);
                return (false);
        }

        bzero(&da, sizeof (ddi_device_acc_attr_t));
        da.devacc_attr_version = DDI_DEVICE_ATTR_V1;
        da.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC;
        da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
        da.devacc_attr_access = DDI_DEFAULT_ACC;

        if ((ret = ddi_regs_map_setup(igc->igc_dip, IGC_PCI_BAR,
            &igc->igc_regs_base, 0, igc->igc_regs_size, &da,
            &igc->igc_regs_hdl)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to map registers: %d",
                    ret);
                return (false);
        }

        return (true);
}

/*
 * Go through the process of initializing the igc core code. First we have to
 * fill in the information that the common code requires to identify the
 * hardware and set the mac type. After that we can go through and set up all
 * of the function initialization.
 */
static bool
igc_core_code_init(igc_t *igc)
{
        int ret;
        int *regs;
        uint_t nprop;

        igc->igc_hw.back = igc;
        igc->igc_hw.vendor_id = pci_config_get16(igc->igc_cfgspace,
            PCI_CONF_VENID);
        igc->igc_hw.device_id = pci_config_get16(igc->igc_cfgspace,
            PCI_CONF_DEVID);
        igc->igc_hw.revision_id = pci_config_get8(igc->igc_cfgspace,
            PCI_CONF_REVID);
        igc->igc_hw.subsystem_vendor_id = pci_config_get16(igc->igc_cfgspace,
            PCI_CONF_SUBVENID);
        igc->igc_hw.subsystem_device_id = pci_config_get16(igc->igc_cfgspace,
            PCI_CONF_SUBSYSID);

        if ((ret = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, igc->igc_dip,
            DDI_PROP_DONTPASS, "reg", &regs, &nprop)) != DDI_PROP_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to look up 'reg' "
                    "property: %d", ret);
                return (false);
        }

        /*
         * We fill out the function and command word as the former is
         * important and the latter is easy to grab. We currently don't fill
         * out the bus type, speed, and width as they're not used by the
         * common code, leaving them all at unknown. We can grab that
         * information when it needs it.
         */
        igc->igc_hw.bus.func = PCI_REG_FUNC_G(regs[0]);
        igc->igc_hw.bus.pci_cmd_word = pci_config_get16(igc->igc_cfgspace,
            PCI_CONF_COMM);
        ddi_prop_free(regs);

        /*
         * The common code asks for the memory mapped address to be set in its
         * structure, though in theory it promises not to use it.
         */
        igc->igc_hw.hw_addr = (uint8_t *)igc->igc_regs_base;

        if ((ret = igc_set_mac_type(&igc->igc_hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to set mac type: %d",
                    ret);
                return (false);
        }

        if ((ret = igc_setup_init_funcs(&igc->igc_hw, true)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to setup core code "
                    "function pointers: %d", ret);
                return (false);
        }

        /*
         * Go ahead and attempt to get the bus information even though this
         * doesn't actually do anything right now.
         */
        if ((ret = igc_get_bus_info(&igc->igc_hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "core code failed to get bus "
                    "info: %d", ret);
                return (false);
        }

        return (true);
}

static bool
igc_limits_init(igc_t *igc)
{
        switch (igc->igc_hw.mac.type) {
        case igc_i225:
                igc->igc_limits.il_max_rx_rings = IGC_MAX_RX_RINGS_I225;
                igc->igc_limits.il_max_tx_rings = IGC_MAX_RX_RINGS_I225;
                igc->igc_limits.il_max_mtu = IGC_MAX_MTU_I225;
                break;
        default:
                dev_err(igc->igc_dip, CE_WARN, "unknown MAC type: %u",
                    igc->igc_hw.mac.type);
                return (false);
        }

        return (true);
}

/*
 * Determine the hardware buffer sizes that are required for the given MTU.
 * There are a few different constraints that we try to enforce here that come
 * from the hardware and others that come from us:
 *
 * 1) The hardware requires that the rx and tx sizes all be 1 KiB (0x400) byte
 *    aligned.
 * 2) Our tx engine can handle copying across multiple descriptors, so we cap
 *    the maximum tx buffer size at one page.
 * 3) Right now our rx engine does not handle scanning multiple buffers for rx
 *    (see the theory statement), so the rx buffer has to fit the maximum
 *    frame size.
 * 4) rx buffers need to also account for IP alignment, so we make sure to
 *    allocate extra bytes for that.
 */
void
igc_hw_buf_update(igc_t *igc)
{
        unsigned long pagesize = ddi_ptob(igc->igc_dip, 1);
        uint32_t tx_mtu;

        igc->igc_max_frame = igc->igc_mtu + sizeof (struct ether_vlan_header) +
            ETHERFCSL;
        igc->igc_rx_buf_size = P2ROUNDUP_TYPED(igc->igc_max_frame +
            IGC_RX_BUF_IP_ALIGN, IGC_BUF_ALIGN, uint32_t);
        tx_mtu = P2ROUNDUP_TYPED(igc->igc_max_frame, IGC_BUF_ALIGN, uint32_t);
        igc->igc_tx_buf_size = MIN(tx_mtu, pagesize);
}
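
/*
 * As a worked example of the above, assuming the default 1500 byte MTU, a
 * 4 KiB page size, and that IGC_RX_BUF_IP_ALIGN is the usual 2 byte IP header
 * alignment shim: igc_max_frame is 1500 + 18 (VLAN header) + 4 (FCS) = 1522
 * bytes, igc_rx_buf_size rounds 1522 + 2 up to the 1 KiB alignment giving
 * 2048 bytes, and igc_tx_buf_size is MIN(2048, 4096) = 2048 bytes. This is
 * the 2 KiB figure mentioned in the theory statement.
 */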

static bool
igc_intr_init(igc_t *igc)
{
        int ret, types, nintrs, navail, req;
        const int min_nintrs = 2;

        if ((ret = ddi_intr_get_supported_types(igc->igc_dip, &types)) !=
            DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get supported "
                    "interrupts: %d", ret);
                return (false);
        }

        /*
         * For now, we simplify our lives and device support by only
         * supporting MSI-X interrupts. When we find versions of this without
         * MSI-X support, we can go and add what we need.
         */
        if ((types & DDI_INTR_TYPE_MSIX) == 0) {
                dev_err(igc->igc_dip, CE_WARN, "device does not support "
                    "MSI-X, found %d", types);
                return (false);
        }

        if ((ret = ddi_intr_get_nintrs(igc->igc_dip, DDI_INTR_TYPE_MSIX,
            &nintrs)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get number of "
                    "supported MSI-X interrupts: %d", ret);
                return (false);
        }

        if (nintrs < min_nintrs) {
                dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires "
                    "%d MSI-X interrupts be supported, found %d", min_nintrs,
                    nintrs);
                return (false);
        }

        if ((ret = ddi_intr_get_navail(igc->igc_dip, DDI_INTR_TYPE_MSIX,
            &navail)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get number of "
                    "available MSI-X interrupts: %d", ret);
                return (false);
        }

        if (navail < min_nintrs) {
                dev_err(igc->igc_dip, CE_WARN, "igc driver currently requires "
                    "%d MSI-X interrupts be available, found %d", min_nintrs,
                    navail);
                return (false);
        }

        /*
         * In the future this could be based upon the multiple queues that the
         * device supports, but for now it's limited to two. See 'Rings and
         * Interrupts' in the theory statement for more background.
         */
        req = min_nintrs;
        req = MIN(req, navail);
        igc->igc_intr_size = req * sizeof (ddi_intr_handle_t);
        igc->igc_intr_handles = kmem_alloc(igc->igc_intr_size, KM_SLEEP);

        if ((ret = ddi_intr_alloc(igc->igc_dip, igc->igc_intr_handles,
            DDI_INTR_TYPE_MSIX, 0, req, &igc->igc_nintrs,
            DDI_INTR_ALLOC_NORMAL)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to allocate "
                    "interrupts: %d", ret);
                return (false);
        }

        igc->igc_intr_type = DDI_INTR_TYPE_MSIX;
        igc->igc_attach |= IGC_ATTACH_INTR_ALLOC;
        if (igc->igc_nintrs < min_nintrs) {
                dev_err(igc->igc_dip, CE_WARN, "received %d interrupts, but "
                    "needed at least %d", igc->igc_nintrs, min_nintrs);
                return (false);
        }

        if ((ret = ddi_intr_get_pri(igc->igc_intr_handles[0],
            &igc->igc_intr_pri)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt "
                    "priority: %d", ret);
                return (false);
        }

        if ((ret = ddi_intr_get_cap(igc->igc_intr_handles[0],
            &igc->igc_intr_cap)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get interrupt "
                    "capabilities: %d", ret);
                return (false);
        }

        return (true);
}

/*
 * As part of allocating our rings we make the following assumptions about
 * interrupt assignments. All tx rings share interrupt 0. All rx rings have
 * separate interrupts starting from interrupt 1. This design may likely
 * change in the face of actual multi-ring support.
 */
static bool
igc_rings_alloc(igc_t *igc)
{
        uint32_t intr = 0;
        igc->igc_tx_rings = kmem_zalloc(sizeof (igc_tx_ring_t) *
            igc->igc_ntx_rings, KM_SLEEP);

        for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
                igc->igc_tx_rings[i].itr_igc = igc;
                igc->igc_tx_rings[i].itr_idx = i;
                igc->igc_tx_rings[i].itr_intr_idx = intr;
                mutex_init(&igc->igc_tx_rings[i].itr_lock, NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(igc->igc_intr_pri));
                if (!igc_tx_ring_stats_init(igc, &igc->igc_tx_rings[i])) {
                        return (false);
                }
        }

        igc->igc_rx_rings = kmem_zalloc(sizeof (igc_rx_ring_t) *
            igc->igc_nrx_rings, KM_SLEEP);
        intr = 1;

        for (uint32_t i = 0; i < igc->igc_nrx_rings; i++, intr++) {
                igc->igc_rx_rings[i].irr_igc = igc;
                igc->igc_rx_rings[i].irr_idx = i;
                igc->igc_rx_rings[i].irr_intr_idx = intr;
                mutex_init(&igc->igc_rx_rings[i].irr_lock, NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(igc->igc_intr_pri));
                mutex_init(&igc->igc_rx_rings[i].irr_free_lock, NULL,
                    MUTEX_DRIVER, DDI_INTR_PRI(igc->igc_intr_pri));
                cv_init(&igc->igc_rx_rings[i].irr_free_cv, NULL, CV_DRIVER,
                    NULL);
                if (!igc_rx_ring_stats_init(igc, &igc->igc_rx_rings[i])) {
                        return (false);
                }
        }

        ASSERT3U(intr, ==, igc->igc_nintrs);

        return (true);
}

/*
 * Set up handlers for our interrupts. Note, we have more or less constrained
 * the device right now to only request two interrupts which we use in a fixed
 * way. If we end up with more varied queue support then this should be
 * changed around.
 */
static bool
igc_intr_hdlr_init(igc_t *igc)
{
        int ret;

        if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[0],
            igc_intr_tx_other, igc, NULL)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to add tx/other "
                    "interrupt handler: %d", ret);
                return (false);
        }

        if ((ret = ddi_intr_add_handler(igc->igc_intr_handles[1],
            igc_intr_rx_queue, igc, (uintptr_t)0)) != DDI_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to add rx interrupt "
                    "handler: %d", ret);
                if ((ret = ddi_intr_remove_handler(
                    igc->igc_intr_handles[0])) != DDI_SUCCESS) {
                        dev_err(igc->igc_dip, CE_WARN, "failed to remove "
                            "tx/other interrupt handler");
                }
                return (false);
        }

        return (true);
}

static void
igc_hw_control(igc_t *igc, bool take)
{
        uint32_t ctrl = igc_read32(igc, IGC_CTRL_EXT);

        if (take) {
                ctrl |= IGC_CTRL_EXT_DRV_LOAD;
        } else {
                ctrl &= ~IGC_CTRL_EXT_DRV_LOAD;
        }

        igc_write32(igc, IGC_CTRL_EXT, ctrl);
}

/*
 * Basic device initialization and sanity check. This covers that we can
 * properly reset the device, validate its checksum, and get a valid MAC
 * address.
 */
static bool
igc_hw_init(igc_t *igc)
{
        int ret;
        uint32_t eecd;

        if ((ret = igc_reset_hw(&igc->igc_hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
                    ret);
                return (false);
        }

        /*
         * Goodbye firmware.
         */
        igc_hw_control(igc, true);

        /*
         * Check the NVM validity if a device is present.
         */
        eecd = igc_read32(igc, IGC_EECD);
        if ((eecd & IGC_EECD_EE_DET) != 0) {
                if ((ret = igc_validate_nvm_checksum(&igc->igc_hw)) !=
                    IGC_SUCCESS) {
                        dev_err(igc->igc_dip, CE_WARN, "failed to validate "
                            "igc NVM checksum: %d", ret);
                        return (false);
                }
        }

        if ((ret = igc_read_mac_addr(&igc->igc_hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to read MAC address: "
                    "%d", ret);
                return (false);
        }

        if ((ret = igc_get_phy_id(&igc->igc_hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to get PHY id: %d",
                    ret);
                return (false);
        }

        return (true);
}

/*
 * In case the user has modified the LED state through MAC_CAPAB_LED, restore
 * that back to the defaults we got when we started up the device.
 */
static void
igc_led_fini(igc_t *igc)
{
        igc_write32(igc, IGC_LEDCTL, igc->igc_ledctl);
}

/*
 * Traditionally the Intel NIC drivers avoid touching activity pins as part of
 * the LED behavior that we drive. We also don't touch a pin if it's in SDP
 * mode and not being used to drive an LED, as that means it's likely not for
 * us.
 */
static bool
igc_led_ignore(i225_led_mode_t mode)
{
        switch (mode) {
        case I225_LED_M_FILTER_ACT:
        case I225_LED_M_LINK_ACT:
        case I225_LED_M_SDP:
        case I225_LED_M_PAUSE:
        case I225_LED_M_ACT:
                return (true);
        default:
                return (false);
        }
}

static inline uint32_t
igc_led_bitoff(uint32_t led)
{
        VERIFY3U(led, <, 3);
        return (led * 8);
}

static inline uint32_t
igc_led_get_mode(uint32_t led, uint32_t reg)
{
        uint32_t off = igc_led_bitoff(led);
        return (bitx32(reg, 3 + off, off));
}

static inline uint32_t
igc_led_set_mode(uint32_t led, uint32_t reg, i225_led_mode_t mode)
{
        uint32_t off = igc_led_bitoff(led);
        return (bitset32(reg, 3 + off, off, mode));
}

static inline uint32_t
igc_led_get_ivrt(uint32_t led, uint32_t reg)
{
        uint32_t off = igc_led_bitoff(led) + 6;
        return (bitx32(reg, off, off));
}

static inline uint32_t
igc_led_set_blink(uint32_t led, uint32_t reg, bool en)
{
        uint32_t off = igc_led_bitoff(led) + 7;
        return (bitset32(reg, off, off, en));
}
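
/*
 * Taken together, the helpers above encode the I210-style LEDCTL layout,
 * which we assume carries over to the I225/226 in the absence of public MAC
 * documentation: each LED owns an 8-bit field starting at bit (led * 8), with
 * the mode in bits 3:0 of that field, the inversion control in bit 6, and the
 * blink enable in bit 7. For example, LED1's mode lives in register bits 11:8
 * and its blink enable in bit 15.
 */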

/*
 * There are three LEDs on the chip. The reference defines LED0 for 1 GbE link
 * up, LED1 for a 2.5 GbE link up, and LED2 for activity. However, this is all
 * controllable in the NVM so we shouldn't assume that these have any of their
 * default values. We instead read the LEDCTL register to see how it was set
 * up by default (though the NVM would likely be better). We then create
 * pre-canned LEDCTL register values for on, off, and default. See igc_osdep.h
 * for some of the caveats in definitions here. Note, we only tweak the
 * non-activity LEDs and if an LED has been indicated that it's being used for
 * SDP, we don't touch it.
 */
static void
igc_led_init(igc_t *igc)
{
        uint32_t led = igc_read32(igc, IGC_LEDCTL);

        igc->igc_ledctl = led;
        igc->igc_ledctl_on = led;
        igc->igc_ledctl_off = led;
        igc->igc_ledctl_blink = led;

        for (uint32_t i = 0; i < IGC_I225_NLEDS; i++) {
                i225_led_mode_t mode = igc_led_get_mode(i, led);
                if (!igc_led_ignore(mode)) {
                        /*
                         * If the inversion logic is on, that changes what the
                         * on and off modes mean, so we need to change how we
                         * set that appropriately.
                         */
                        if (igc_led_get_ivrt(i, led) != 0) {
                                igc->igc_ledctl_on = igc_led_set_mode(i,
                                    igc->igc_ledctl_on, I225_LED_M_OFF);
                                igc->igc_ledctl_off = igc_led_set_mode(i,
                                    igc->igc_ledctl_off, I225_LED_M_ON);
                                igc->igc_ledctl_blink = igc_led_set_mode(i,
                                    igc->igc_ledctl_blink, I225_LED_M_OFF);
                        } else {
                                igc->igc_ledctl_on = igc_led_set_mode(i,
                                    igc->igc_ledctl_on, I225_LED_M_ON);
                                igc->igc_ledctl_off = igc_led_set_mode(i,
                                    igc->igc_ledctl_off, I225_LED_M_OFF);
                                igc->igc_ledctl_blink = igc_led_set_mode(i,
                                    igc->igc_ledctl_blink, I225_LED_M_ON);
                        }
                }

                igc->igc_ledctl_blink = igc_led_set_blink(i,
                    igc->igc_ledctl_blink, true);
        }

        igc->igc_led_mode = MAC_LED_DEFAULT;
}

static void
igc_write_ivar(igc_t *igc, uint32_t queue, bool rx, uint32_t msix)
{
        const uint32_t ivarno = queue >> 1;
        const uint32_t reg = IGC_IVAR0 + ivarno * 4;
        const uint32_t val = msix | IGC_IVAR_VALID;
        uint32_t bitoff, bitend, ivar;

        if (rx) {
                if ((queue % 2) == 0) {
                        bitoff = IGC_IVAR_RX0_START;
                } else {
                        bitoff = IGC_IVAR_RX1_START;
                }
        } else {
                if ((queue % 2) == 0) {
                        bitoff = IGC_IVAR_TX0_START;
                } else {
                        bitoff = IGC_IVAR_TX1_START;
                }
        }
        bitend = bitoff + IGC_IVAR_ENT_LEN - 1;

        ivar = igc_read32(igc, reg);
        ivar = bitset32(ivar, bitend, bitoff, val);
        igc_write32(igc, reg, ivar);
        igc->igc_eims |= 1 << msix;
}
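
/*
 * To make the mapping above concrete: mapping rx queue 2 to MSI-X vector 1
 * lands in the second IVAR register (IGC_IVAR0 + 4) in the even rx slot. The
 * vector number is OR'd with IGC_IVAR_VALID and written at
 * IGC_IVAR_RX0_START, and bit 1 is then set in our cached EIMS value so that
 * igc_hw_intr_enable() can later unmask and auto-clear that vector.
 */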

/*
 * Here we need to go through and initialize the hardware's notion of how
 * interrupts are mapped to causes. The device must be specifically enabled
 * for MSI-X and then this is also where we go ensure that all of our
 * interrupt coalescing is properly enabled. Note, we must first touch the
 * GPIE register to enable MSI-X settings otherwise later settings won't do
 * anything.
 */
static void
igc_hw_intr_init(igc_t *igc)
{
        uint32_t gpie, ivar;

        gpie = IGC_GPIE_NSICR | IGC_GPIE_MSIX_MODE | IGC_GPIE_EIAME |
            IGC_GPIE_PBA;
        igc_write32(igc, IGC_GPIE, gpie);

        /*
         * Other causes are always explicitly mapped to cause 0. Each ring
         * then has its own mapping. In the MISC IVAR, these start at bit 8.
         * We leave the '0 |' out below just to avoid a compiler complaining.
         * We also must unmask this interrupt cause, which is in bit 0.
         */
        ivar = IGC_IVAR_VALID << 8;
        igc_write32(igc, IGC_IVAR_MISC, ivar);
        igc->igc_eims = 1;

        /*
         * There are a few IVAR registers available in hardware. Each IVAR
         * register handles mapping a given queue to an MSI-X. Each IVAR
         * handles two queues.
         */
        for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
                igc_write_ivar(igc, i, false,
                    igc->igc_tx_rings[i].itr_intr_idx);
        }

        for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
                igc_write_ivar(igc, i, true,
                    igc->igc_rx_rings[i].irr_intr_idx);
        }

        for (uint32_t i = 0; i < igc->igc_nintrs; i++) {
                igc_write32(igc, IGC_EITR(i), igc->igc_eitr);
        }
}

/*
 * Synchronize our sense of the unicast table over to the device. If this is
 * the first time that we're here due to attach, we need to go through and
 * allocate the tracking table.
 */
static void
igc_unicast_sync(igc_t *igc)
{
        ASSERT(MUTEX_HELD(&igc->igc_lock));

        if (igc->igc_ucast == NULL) {
                igc->igc_nucast = igc->igc_hw.mac.rar_entry_count;
                igc->igc_ucast = kmem_zalloc(sizeof (igc_addr_t) *
                    igc->igc_nucast, KM_SLEEP);
        }

        for (uint16_t i = 0; i < igc->igc_nucast; i++) {
                int ret = igc_rar_set(&igc->igc_hw, igc->igc_ucast[i].ia_mac,
                    i);
                /*
                 * Common code today guarantees this can't fail. This is here
                 * to guard against future updates.
                 */
                VERIFY3S(ret, ==, IGC_SUCCESS);
        }
}

/*
 * The core code interface to the multicast table requires us to give them a
 * packed uint8_t array that they manually walk through in ETHERADDRL (6 byte)
 * chunks. This must be packed. To deal with this we opt to preserve a normal
 * list of multicast addresses and then a secondary version that's serialized
 * as the core code wants it. We allocate the memory for this secondary
 * version at the start.
 */
void
igc_multicast_sync(igc_t *igc)
{
        uint16_t nvalid;

        ASSERT(MUTEX_HELD(&igc->igc_lock));

        if (igc->igc_mcast == NULL) {
                igc->igc_nmcast = igc->igc_hw.mac.mta_reg_count;
                igc->igc_mcast = kmem_zalloc(sizeof (igc_addr_t) *
                    igc->igc_nmcast, KM_SLEEP);
                igc->igc_mcast_raw = kmem_alloc(sizeof (ether_addr_t) *
                    igc->igc_nmcast, KM_SLEEP);
        }

        bzero(igc->igc_mcast_raw, sizeof (ether_addr_t) * igc->igc_nmcast);
        nvalid = 0;
        for (uint16_t i = 0; i < igc->igc_nmcast; i++) {
                ether_addr_t *targ = &igc->igc_mcast_raw[nvalid];

                if (!igc->igc_mcast[i].ia_valid)
                        continue;
                bcopy(igc->igc_mcast[i].ia_mac, targ, sizeof (ether_addr_t));
                nvalid++;
        }

        igc_update_mc_addr_list(&igc->igc_hw, (uint8_t *)igc->igc_mcast_raw,
            nvalid);
}

/*
 * This function is used to reinitialize the PBA and our various flow control
 * settings, reset the hardware, and ensure that the EEE, LPLU, and related
 * power modes are in the correct state.
 */
bool
igc_hw_common_init(igc_t *igc)
{
        int ret;
        uint32_t pba, hwm, hwmp, hwm2x;
        struct igc_hw *hw = &igc->igc_hw;

        /*
         * The PBA register determines which portion is used for the receive
         * buffers and which is used for the transmit buffers. This follows
         * from the I210 and reference drivers which use 34K as the default.
         * We currently leave the RXPBS and TXPBS at their power-on-reset
         * defaults.
         *
         * We set the watermark based settings similar to igb, ensuring that
         * we have 16-byte granularity. The general guidelines from there were
         * that when it comes to automatic Ethernet PAUSE frame generation we
         * should:
         *
         * - After an XOFF, you want to receive at least two frames. We use
         *   whichever is smaller of 9/10ths and two frames.
         * - The low water mark apparently wants to be closer to the high
         *   water mark.
         *
         * See igb_init_adapter() for more information. We basically use the
         * same calculation it did, given that the MAC is basically the same.
         */
        pba = IGC_PBA_34K;
        hwmp = (pba << 10) * 9 / 10;
        hwm2x = (pba << 10) - 2 * igc->igc_max_frame;
        hwm = MIN(hwmp, hwm2x);

        hw->fc.high_water = hwm & 0xfffffff0;
        hw->fc.low_water = igc->igc_hw.fc.high_water - 16;
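
        /*
         * To make the arithmetic above concrete, assume the default 1500
         * byte MTU, i.e. an igc_max_frame of 1522 bytes, with the 34K PBA:
         *
         *        hwmp  = (34 << 10) * 9 / 10    = 31334
         *        hwm2x = (34 << 10) - 2 * 1522  = 31772
         *        hwm   = MIN(31334, 31772)      = 31334
         *
         * After 16-byte alignment that yields a high water mark of 31328
         * bytes and a low water mark of 31312 bytes.
         */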

        /*
         * Use the suggested default pause time.
         */
        hw->fc.pause_time = IGC_FC_PAUSE_TIME;
        hw->fc.send_xon = true;

        if ((ret = igc_reset_hw(hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to reset device: %d",
                    ret);
                return (false);
        }

        if ((ret = igc_init_hw(hw)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to init hardware: %d",
                    ret);
                return (false);
        }

        /*
         * Clear wake on LAN and set other power states. In addition, disable
         * EEE for now.
         */
        igc_write32(igc, IGC_WUC, 0);

        if ((ret = igc_set_d0_lplu_state(hw, false)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to set D0 LPLU mode: "
                    "%d", ret);
                return (false);
        }

        /*
         * There have been reports that enabling EEE for some 2.5G devices has
         * led to issues with the I225/226. It's not entirely clear, but we
         * default to disabling this like in igb/e1000g for now.
         */
        if ((ret = igc_set_eee_i225(hw, false, false, false)) != IGC_SUCCESS) {
                dev_err(igc->igc_dip, CE_WARN, "failed to set EEE mode: %d",
                    ret);
                return (false);
        }

        igc_hw_intr_init(igc);

        mutex_enter(&igc->igc_lock);
        igc_unicast_sync(igc);
        igc_multicast_sync(igc);

        igc->igc_hw.mac.get_link_status = true;
        (void) igc_get_phy_info(hw);
        (void) igc_check_for_link(hw);
        mutex_exit(&igc->igc_lock);

        return (true);
}

static bool
igc_intr_en(igc_t *igc)
{
        int ret;

        if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
                ret = ddi_intr_block_enable(igc->igc_intr_handles,
                    igc->igc_nintrs);
                if (ret != DDI_SUCCESS) {
                        dev_err(igc->igc_dip, CE_WARN, "failed to block "
                            "enable interrupts: %d", ret);
                        return (false);
                }
        } else {
                for (int i = 0; i < igc->igc_nintrs; i++) {
                        ret = ddi_intr_enable(igc->igc_intr_handles[i]);
                        if (ret != DDI_SUCCESS) {
                                dev_err(igc->igc_dip, CE_WARN, "failed to "
                                    "enable interrupt %d: %d", i, ret);
                                for (int clean = 0; clean < i; clean++) {
                                        ret = ddi_intr_disable(
                                            igc->igc_intr_handles[clean]);
                                        if (ret != DDI_SUCCESS) {
                                                dev_err(igc->igc_dip, CE_WARN,
                                                    "failed to disable "
                                                    "interrupt %d while "
                                                    "unwinding: %d", clean,
                                                    ret);
                                        }
                                }
                                return (false);
                        }
                }
        }

        /*
         * Now that we've enabled interrupts here, clear any pending
         * interrupts; actually unmasking them in hardware happens later in
         * igc_hw_intr_enable().
         */
        (void) igc_read32(igc, IGC_ICR);

        return (true);
}
1445
1446 /*
1447 * Undo interrupt enablement.
1448 */
1449 void
igc_hw_intr_disable(igc_t * igc)1450 igc_hw_intr_disable(igc_t *igc)
1451 {
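	/*
	 * Mask every cause in both the extended (EIMC) and legacy (IMC)
	 * interrupt mask clear registers and turn off extended auto-clear
	 * (EIAC) so that nothing re-arms behind our back.
	 */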
	igc_write32(igc, IGC_EIMC, UINT32_MAX);
	igc_write32(igc, IGC_EIAC, 0);
	igc_write32(igc, IGC_IMC, UINT32_MAX);
}

/*
 * This is used during the GLDv3 mc_start(9E) entry point to enable interrupts
 * on the device itself.
 */
void
igc_hw_intr_enable(igc_t *igc)
{
	uint32_t ims;

	/*
	 * First we clear pending interrupts.
	 */
	(void) igc_read32(igc, IGC_ICR);

	/*
	 * The hardware has extended and non-extended interrupt masks and
	 * auto-clear registers. We always disable auto-clear for the
	 * non-extended portions. See the I210 datasheet 'Setting Interrupt
	 * Registers' for a better sense of what's going on here.
	 *
	 * In the IMS register we always register link status change events and
	 * device reset assertions.
	 */
	ims = IGC_IMS_LSC | IGC_IMS_DRSTA;

	igc_write32(igc, IGC_EIAC, igc->igc_eims);
	igc_write32(igc, IGC_EIMS, igc->igc_eims);
	igc_write32(igc, IGC_IMS, ims);
	igc_write32(igc, IGC_IAM, 0);
}

static void
igc_cleanup(igc_t *igc)
{
	if (igc->igc_mcast != NULL) {
		ASSERT3U(igc->igc_nmcast, !=, 0);
		kmem_free(igc->igc_mcast_raw, sizeof (ether_addr_t) *
		    igc->igc_nmcast);
		kmem_free(igc->igc_mcast, sizeof (igc_addr_t) *
		    igc->igc_nmcast);
		igc->igc_nmcast = 0;
		igc->igc_mcast = NULL;
	}

	if (igc->igc_ucast != NULL) {
		ASSERT3U(igc->igc_nucast, !=, 0);
		kmem_free(igc->igc_ucast, sizeof (igc_addr_t) *
		    igc->igc_nucast);
		igc->igc_nucast = 0;
		igc->igc_ucast = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_EN) != 0) {
		int ret;
		if ((igc->igc_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
			ret = ddi_intr_block_disable(igc->igc_intr_handles,
			    igc->igc_nintrs);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "block disable interrupts: %d", ret);
			}
		} else {
			for (int i = 0; i < igc->igc_nintrs; i++) {
				ret = ddi_intr_disable(
				    igc->igc_intr_handles[i]);
				if (ret != DDI_SUCCESS) {
					dev_err(igc->igc_dip, CE_WARN, "failed "
					    "to disable interrupt %d: %d", i,
					    ret);
				}
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_EN;
	}

	if ((igc->igc_attach & IGC_ATTACH_MAC) != 0) {
		int ret = mac_unregister(igc->igc_mac_hdl);
		if (ret != 0) {
			dev_err(igc->igc_dip, CE_WARN, "failed to unregister "
			    "MAC handle: %d", ret);
		}
		igc->igc_attach &= ~IGC_ATTACH_MAC;
	}

	if ((igc->igc_attach & IGC_ATTACH_STATS) != 0) {
		igc_stats_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_STATS;
	}

	if ((igc->igc_attach & IGC_ATTACH_LED) != 0) {
		igc_led_fini(igc);
		igc->igc_attach &= ~IGC_ATTACH_LED;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_HANDLER) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret =
			    ddi_intr_remove_handler(igc->igc_intr_handles[i]);
			if (ret != 0) {
				dev_err(igc->igc_dip, CE_WARN, "failed to "
				    "remove interrupt %d handler: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_HANDLER;
	}

	if (igc->igc_tx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
			igc_tx_ring_stats_fini(&igc->igc_tx_rings[i]);
			mutex_destroy(&igc->igc_tx_rings[i].itr_lock);
		}
		kmem_free(igc->igc_tx_rings, sizeof (igc_tx_ring_t) *
		    igc->igc_ntx_rings);
		igc->igc_tx_rings = NULL;
	}

	if (igc->igc_rx_rings != NULL) {
		for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
			igc_rx_ring_stats_fini(&igc->igc_rx_rings[i]);
			cv_destroy(&igc->igc_rx_rings[i].irr_free_cv);
			mutex_destroy(&igc->igc_rx_rings[i].irr_free_lock);
			mutex_destroy(&igc->igc_rx_rings[i].irr_lock);
		}
		kmem_free(igc->igc_rx_rings, sizeof (igc_rx_ring_t) *
		    igc->igc_nrx_rings);
		igc->igc_rx_rings = NULL;
	}

	if ((igc->igc_attach & IGC_ATTACH_MUTEX) != 0) {
		mutex_destroy(&igc->igc_lock);
		igc->igc_attach &= ~IGC_ATTACH_MUTEX;
	}

	if ((igc->igc_attach & IGC_ATTACH_INTR_ALLOC) != 0) {
		for (int i = 0; i < igc->igc_nintrs; i++) {
			int ret = ddi_intr_free(igc->igc_intr_handles[i]);
			if (ret != DDI_SUCCESS) {
				dev_err(igc->igc_dip, CE_WARN, "unexpected "
				    "failure freeing interrupt %d: %d", i, ret);
			}
		}
		igc->igc_attach &= ~IGC_ATTACH_INTR_ALLOC;
	}

	if (igc->igc_intr_handles != NULL) {
		ASSERT3U(igc->igc_intr_size, !=, 0);
		kmem_free(igc->igc_intr_handles, igc->igc_intr_size);
	}

	/*
	 * Now that we're almost done, begrudgingly let firmware know we're
	 * done.
	 */
	igc_hw_control(igc, false);

	if (igc->igc_regs_hdl != NULL) {
		ddi_regs_map_free(&igc->igc_regs_hdl);
		igc->igc_regs_base = NULL;
	}

	if (igc->igc_cfgspace != NULL) {
		pci_config_teardown(&igc->igc_cfgspace);
	}
	igc->igc_attach &= ~IGC_ATTACH_REGS;

	ddi_set_driver_private(igc->igc_dip, NULL);
	igc->igc_dip = NULL;

	VERIFY0(igc->igc_attach);

	kmem_free(igc, sizeof (igc_t));
}

static int
igc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	igc = kmem_zalloc(sizeof (igc_t), KM_SLEEP);
	ddi_set_driver_private(dip, igc);
	igc->igc_dip = dip;

	/*
	 * Initialize a few members that are not zero-based.
	 */
	igc->igc_link_duplex = LINK_DUPLEX_UNKNOWN;
	igc->igc_link_state = LINK_STATE_UNKNOWN;

	/*
	 * Set up all the register spaces that hardware requires.
	 */
	if (!igc_setup_regs(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_REGS;
	/*
	 * Set up the common code.
	 */
	if (!igc_core_code_init(igc)) {
		goto err;
	}

	if (!igc_limits_init(igc)) {
		goto err;
	}

	/*
	 * Go allocate and set up all of our interrupts.
	 */
	if (!igc_intr_init(igc)) {
		goto err;
	}

	/*
	 * Initialize our main mutex for the device now that we have an
	 * interrupt priority.
	 */
	mutex_init(&igc->igc_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(igc->igc_intr_pri));
	igc->igc_attach |= IGC_ATTACH_MUTEX;
	/*
	 * We now want to determine the total number of rx and tx rings that we
	 * have based on our interrupt allocation so we can go through and
	 * perform the rest of the device setup that is required. The various
	 * queues that we have are mapped to a given MSI-X vector through the
	 * IVAR registers in the device. There is also an IVAR_MISC register
	 * that maps link state change events and other causes onto up to two
	 * vectors.
	 *
	 * There isn't strictly per-queue interrupt generation control.
	 * Instead, when in MSI-X mode, the device has an extended interrupt
	 * cause and mask register. The mask register allows us to mask the
	 * five bits described above.
	 *
	 * Because of all this we end up limiting the number of queues that we
	 * use to 2 for now: 1 for tx and 1 for rx. Interrupt 0 is for tx/other
	 * and 1 for rx, as laid out below.
	 */
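	/*
	 * To make the above concrete, the vector layout under the current
	 * 2-vector allocation looks like this:
	 *
	 *	vector 0	tx queue 0, link state change, other causes
	 *	vector 1	rx queue 0
	 */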
	igc->igc_nrx_rings = 1;
	igc->igc_ntx_rings = 1;

	/*
	 * Default to a 1500 byte MTU.
	 */
	igc->igc_mtu = ETHERMTU;
	igc_hw_buf_update(igc);

	/*
	 * Initialize default descriptor limits and thresholds. We allocate 1.5
	 * times the number of rx descriptors so that we can loan up to 1/3rd
	 * of them. We allocate an even number of tx descriptors. The comment
	 * below illustrates the resulting buffer accounting.
	 */
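	/*
	 * As an illustration (the actual IGC_DEF_RX_RING_SIZE value lives in
	 * the header): with a ring of N rx descriptors we allocate N + N/2
	 * buffers, leaving N/2 of them (1/3rd of the total) free to be loaned
	 * upstream while the ring itself stays fully populated.
	 */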
	igc->igc_rx_ndesc = IGC_DEF_RX_RING_SIZE;
	igc->igc_tx_ndesc = IGC_DEF_TX_RING_SIZE;
	igc->igc_rx_nbuf = igc->igc_rx_ndesc + (igc->igc_rx_ndesc >> 1);
	igc->igc_tx_nbuf = igc->igc_tx_ndesc;
	igc->igc_rx_nfree = igc->igc_rx_nbuf - igc->igc_rx_ndesc;
	igc->igc_rx_intr_nframes = IGC_DEF_RX_RING_INTR_LIMIT;
	igc->igc_rx_bind_thresh = IGC_DEF_RX_BIND;
	igc->igc_tx_bind_thresh = IGC_DEF_TX_BIND;
	igc->igc_tx_notify_thresh = IGC_DEF_TX_NOTIFY_MIN;
	igc->igc_tx_recycle_thresh = IGC_DEF_TX_RECYCLE_MIN;
	igc->igc_tx_gap = IGC_DEF_TX_GAP;
	igc->igc_eitr = IGC_DEF_EITR;

	if (!igc_rings_alloc(igc)) {
		goto err;
	}

	if (!igc_intr_hdlr_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_HANDLER;

	/*
	 * Next reset the device before we begin initializing anything else. As
	 * part of this, validate the flash checksum if present. This is all
	 * initialization that we would only do once per device. Initialization
	 * that we want to redo after any reset lives in igc_hw_common_init().
	 */
	if (!igc_hw_init(igc)) {
		goto err;
	}

	igc_led_init(igc);
	igc->igc_attach |= IGC_ATTACH_LED;

	/*
	 * Snapshot our basic settings that users can eventually control in the
	 * device. We start with always enabling auto-negotiation and
	 * advertising the basic supported speeds. The I225 v1 does have
	 * substantial problems with enabling 2.5G due to the fact that it
	 * doesn't maintain a proper inter-packet gap. Despite that, we default
	 * to enabling 2.5G for now, as it's supposedly not broken with all
	 * link partners and NVM versions. We also don't have a way of actually
	 * identifying affected parts and mapping that to something in the
	 * driver today, unfortunately.
	 */
	igc->igc_hw.mac.autoneg = true;
	igc->igc_hw.phy.autoneg_wait_to_complete = false;
	igc->igc_hw.phy.autoneg_advertised = IGC_DEFAULT_ADV;
	igc->igc_hw.fc.requested_mode = igc_fc_default;
	igc->igc_hw.fc.current_mode = igc_fc_default;

	if (!igc_hw_common_init(igc)) {
		goto err;
	}

	if (!igc_stats_init(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_STATS;

	/*
	 * Register with MAC
	 */
	if (!igc_mac_register(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_MAC;

	/*
	 * Enable interrupts and get going.
	 */
	if (!igc_intr_en(igc)) {
		goto err;
	}
	igc->igc_attach |= IGC_ATTACH_INTR_EN;

	return (DDI_SUCCESS);

err:
	igc_cleanup(igc);
	return (DDI_FAILURE);
}

static int
igc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	igc_t *igc;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	igc = ddi_get_driver_private(dip);
	if (igc == NULL) {
		dev_err(dip, CE_WARN, "asked to detach, but missing igc_t");
		return (DDI_FAILURE);
	}

	igc_cleanup(igc);
	return (DDI_SUCCESS);
}

static struct cb_ops igc_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops igc_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = igc_attach,
	.devo_detach = igc_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &igc_cb_ops
};

static struct modldrv igc_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel I225/226 Ethernet Controller",
	.drv_dev_ops = &igc_dev_ops
};

static struct modlinkage igc_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &igc_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	mac_init_ops(&igc_dev_ops, IGC_MOD_NAME);

	if ((ret = mod_install(&igc_modlinkage)) != 0) {
		mac_fini_ops(&igc_dev_ops);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&igc_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&igc_modlinkage)) == 0) {
		mac_fini_ops(&igc_dev_ops);
	}

	return (ret);
}