xref: /illumos-gate/usr/src/uts/common/io/ena/ena.c (revision fdd3baea1de807613d7541b2fad475760768584b)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2024 Oxide Computer Company
14  */
15 
16 #include "ena_hw.h"
17 #include "ena.h"
18 
19 /*
20  * Elastic Network Adapter (ENA) Driver
21  * ------------------------------------
22  *
23  * The ena driver provides support for the AWS ENA device, also
24  * referred to as their "enhanced networking". This device is present
25  * on "Nitro"-based instances. It presents itself with the following
26  * PCI Vendor/Device IDs
27  *
28  * o 1d0f:0ec2 -- ENA PF
29  * o 1d0f:1ec2 -- ENA PF (Reserved)
30  * o 1d0f:ec20 -- ENA VF
31  * o 1d0f:ec21 -- ENA VF (Reserved)
32  *
33  * This driver provides support for only the essential features needed
34  * to drive traffic on an ENA device. Support for the following
35  * features IS NOT currently implemented.
36  *
37  *    o Admin Queue Interrupts: queue completion events are always polled
38  *    o AENQ keep alive
39  *    o FMA
40  *    o Rx checksum offloads
41  *    o Tx checksum offloads
42  *    o Tx DMA bind (borrow buffers)
43  *    o Rx DMA bind (loaned buffers)
44  *    o TSO
45  *    o RSS
46  *    o Low Latency Queues (LLQ)
47  *    o Support for different Tx completion policies
48  *    o More controlled Tx recycling and Rx refill
49  *
50  * Even without these features the ena driver should perform
51  * reasonably well.
52  *
53  * Driver vs. Hardware Types
54  * -------------------------
55  *
56  * To properly communicate with the ENA device the driver must
57  * populate memory (registers and buffers) with specific types. These
58  * types are defined by the device and are found under the "common"
59  * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
60  * simplified this a bit by defining all device-specific types in the
61  * ena_hw.h file. Furthermore, all device-specific types are given an
62  * "enahw" prefix. This makes it clear when we are dealing with a
63  * device type and when we are dealing with a driver type.
64  *
65  * [1]: https://github.com/amzn/amzn-drivers
66  *
67  * Groups, Rings (Queues), and Interrupts
68  * --------------------------------------
69  *
70  * The ENA device presents one mac group. This single mac group
71  * represents the single unicast address that this device represents
72  * in your AWS instance. The ENA device presents no option for
73  * configuring additional MAC addresses, multicast, or promisc mode --
74  * you receive only what AWS wants you to receive.
75  *
76  * This single mac group may have one or more rings. The ENA driver
77  * refers to rings as queues, for no special reason other than it was
78  * the dominant language in the Linux and FreeBSD drivers, and it
79  * spilled over into this port. The upper bound on number of queues is
80  * presented by the device. However, we don't just go with whatever
81  * number of queues the device reports; but rather we limit the queues
82  * based on other factors such as an absolute maximum, number of
83  * online CPUs, and number of available interrupts. The upper bound is
84  * calculated by ena_set_max_io_queues(), and that is used and
 * possibly further restricted in ena_attach_intr_alloc(). At this
86  * point, ultimately, it is the number of available interrupts (minus
87  * one for the admin queue) that determines the number of queues: one
88  * Tx and one Rx on each I/O interrupt.
89  *
90  * NOTE: Perhaps it is overly restrictive to limit the number of
91  * queues to the number of I/O interrupts. Something worth considering
 * on larger instances if they present far fewer interrupts than they
93  * do queues + CPUs.
94  *
95  * The ENA device presents MSI-X interrupts only. During attach the
96  * driver queries the number of available interrupts and sets aside
97  * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
98  * This means that a Tx/Rx queue at index 0 will map to vector 1, and
99  * so on.
100  *
101  * NOTE: The ENA driver currently doesn't make use of the Admin Queue
 * interrupt. This interrupt is used to notify the driver that a
 * command response is ready. The ENA driver always polls the Admin
104  * Queue for responses.
105  *
106  * Tx Queue Workings
107  * -----------------
108  *
109  * A single Tx queue (ena_txq_t) is made up of one submission queue
110  * (SQ) and its paired completion queue (CQ). These two queues form a
111  * logical descriptor ring which is used to send packets out of the
112  * device -- where each SQ entry describes the packet to be sent
113  * (enahw_tx_desc_t) and each CQ entry describes the result of sending
114  * a packet (enahw_tx_cdesc_t). For this to work the host and device
115  * must agree on which descriptors are currently owned by the host
116  * (free for sending) and which are owned by the device (pending
117  * device completion). This state is tracked on the host side via head
118  * and tail indexes along with a phase value.
119  *
120  * The head and tail values represent the head and tail of the FIFO
121  * queue of pending packets -- the next packet to be sent by the
122  * device is head, and all descriptors up to tail are ready for
123  * sending. The phase allows the host to determine which CQ
124  * descriptors represent completed events when using per-SQ completion
125  * events (as opposed to queue head pointer updates). As the queues
126  * represent a logical ring buffer, the phase must alternate on
127  * wrap-around. The device initializes the phase to zero, and the host
128  * starts with a phase of 1. The first packet descriptor writes, and
129  * their corresponding completions, are indicated with a phase of 1.
130  *
131  *
132  * For example, the diagram below represents the SQ/CQ state after the
133  * first 6 packets have been sent by the host and 2 of them have been
134  * completed by the device (and these completions have been processed
135  * by the driver). In this state the host could send 4 more packets
136  * before needing to wait on completion events.
137  *
138  *
139  *    +---+---+---+---+---+---+---+---+
140  * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |   phase = 1
141  *    +---+---+---+---+---+---+---+---+
142  *                              ^
143  *                              |
144  *                            tail
145  *            head
146  *              |
147  *              v
148  *    +---+---+---+---+---+---+---+---+
149  * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |   phase = 1
150  *    +---+---+---+---+---+---+---+---+
151  *
152  *
153  * The next diagram shows how the state changes as 5 more packets are
154  * sent (for a total of 11) and 7 more are completed (for a total of
155  * 9). Notice that as the SQ and CQ have wrapped around their phases
156  * have been complemented. In this state the host could send 6 more
157  * packets before needing to wait on completion events.
158  *
159  *    +---+---+---+---+---+---+---+---+
160  * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |   phase = 0
161  *    +---+---+---+---+---+---+---+---+
162  *                  ^
163  *                  |
164  *                tail
165  *        head
166  *          |
167  *          v
168  *    +---+---+---+---+---+---+---+---+
169  * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |   phase = 0
170  *    +---+---+---+---+---+---+---+---+
171  *
172  *
173  * Currently, all packets are copied for Tx. At ring start we allocate
 * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a
 * DMA buffer associated with it; and each buffer is large enough to
176  * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
177  * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
178  * the TCB's DMA buffer, and a new descriptor is written to the SQ
179  * describing said TCB buffer. If and when we add more advanced
180  * features like DMA binding of mblks and TSO, this 1:1 guarantee will
181  * no longer hold.
182  *
183  * Rx Queue Workings
184  * -----------------
185  *
186  * In terms of implementing the logical descriptor ring, the Rx queues
187  * are very much like the Tx queues. There is a paired SQ and CQ for
188  * each logical ring. The difference is that in Rx the SQ is for
189  * handing buffers to the device to fill, and the CQ is for describing
190  * the contents of those buffers for a given received frame. At Rx
191  * ring start we allocate a Rx Control Buffer (RCB) for each
192  * descriptor in the ring. Each RCB has a DMA buffer associated with
193  * it; and each buffer is large enough to hold the MTU. For each
194  * received frame we copy the contents out of the RCB and into its own
195  * mblk, immediately returning the RCB for reuse. As with Tx, this
196  * gives us a simple 1:1 mapping currently, but if more advanced
197  * features are implemented later this could change.
198  *
199  * Asynchronous Event Notification Queue (AENQ)
200  * --------------------------------------------
201  *
202  * Each ENA device comes with a mechanism for sending out-of-band
203  * notifications to the driver. This includes events like link state
204  * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
205  * delivery mechanism is via interrupt, handled by the ena_aenq_work()
206  * function, which dispatches via the eaenq_hdlrs table. If no handler
207  * is registered, the ena_aenq_default_hdlr() handler is used. A given
208  * device may not support all the different event types
209  * (enahw_aenq_groups_t); and the driver may choose to enable a subset
210  * of the supported events. During attach we call ena_setup_aenq() to
211  * negotiate the supported/enabled events. The enabled group is stored
212  * at ena_aenq_enabled_groups.
213  *
214  * Queues and Unsigned Wraparound
215  * ------------------------------
216  *
217  * All the queues use a uint16_t value as their head/tail values, e.g.
218  * the Rx queue's er_cq_head_idx value. You might notice that we only
219  * ever increment these values, letting them perform implicit unsigned
220  * integer wraparound. This is intended. This is the same behavior as
221  * the common code, and seems to be what the hardware expects. Of
222  * course, when accessing our own descriptor arrays we must make sure
223  * to first perform a modulo of this value or risk running off into
224  * space.
225  *
226  * Attach Sequencing
227  * -----------------
228  *
229  * Most drivers implement their attach/detach/cleanup functions as a
230  * sequential stream of function calls used to allocate and initialize
231  * resources in an order determined by the device's programming manual
232  * combined with any requirements imposed by the kernel and its
233  * relevant modules. These functions can become quite long. It is
234  * often hard to see the order in which steps are taken, and even
235  * harder to tell if detach/cleanup undoes them in the correct order,
236  * or even if it undoes them at all! The only sure way to understand
237  * the flow is to take good notes while closely inspecting each line
238  * of code. Even then, it's easy for attach and detach to get out of
239  * sync.
240  *
241  * Some more recent drivers have improved on this situation by using a
242  * bit vector to track the sequence of events in attach/detach. Each
 * bit is declared as an enum value, in the same order it is
244  * expected attach would run, and thus detach would run in the exact
245  * opposite order. This has three main benefits:
246  *
247  *    1. It makes it easier to determine sequence order at a
248  *       glance.
249  *
250  *    2. It gives a better idea of what state the device is in during
251  *       debugging (the sequence bit vector is kept with the instance
252  *       state).
253  *
254  *    3. The detach function can verify that all sequence bits are
255  *       cleared, indicating that everything done in attach was
256  *       successfully undone.
257  *
258  * These are great improvements. However, the attach/detach functions
259  * can still become unruly, and there is still no guarantee that
260  * detach is done in opposite order of attach (this is not always
261  * strictly required, but is probably the best way to write detach).
262  * There is still a lot of boilerplate and chance for programmer
263  * error.
264  *
265  * The ena driver takes the sequence idea a bit further, creating a
266  * descriptor table of the attach sequence (ena_attach_tbl). This
267  * table is used by attach/detach to generically, declaratively, and
268  * programmatically enforce the precise sequence order and verify that
269  * anything that is done is undone. This provides several benefits:
270  *
271  *    o Correct order is enforced implicitly by the descriptor table.
 *      It is impossible for the detach sequence to run in any
 *      order other than opposite that of attach.
274  *
275  *    o It is obvious what the precise attach sequence is. While the
276  *      bit vector enum helps a lot with this it doesn't prevent
277  *      programmer error. With the sequence defined as a declarative
278  *      table it makes it easy for the programmer to see the order and
279  *      know it's followed exactly.
280  *
281  *    o It is impossible to modify the attach sequence without also
282  *      specifying a callback for its dual in the detach sequence.
283  *
284  *    o Common and repetitive code like error checking, logging, and bit
285  *      vector modification is eliminated and centralized, again
286  *      reducing the chance of programmer error.
287  *
288  * The ena attach sequence is defined under ena_attach_seq_t. The
289  * descriptor table is defined under ena_attach_tbl.
290  */
291 
/*
 * These are some basic data layout invariants on which development
 * assumptions were made.
 */
CTASSERT(sizeof (enahw_aenq_desc_t) == 64);
/* TODO: Why doesn't this work? */
/* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */
CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));
/*
 * We add this here as an extra safety check to make sure that any
 * addition to the AENQ group enum also updates the groups array num
 * value.
 */
CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6);

/*
 * Amazon does not specify the endianness of the ENA device. We assume
 * it's the same as the bus, and we assume the CPU/bus is always
 * little endian. Building for a big-endian target is therefore
 * rejected at compile time.
 */
#ifdef _BIG_ENDIAN
#error "ENA driver is little-endian only"
#endif

/*
 * These values are used to communicate the driver version to the AWS
 * hypervisor via the ena_set_host_info() function. We don't know what
 * exactly AWS does with this info, but it's fairly safe to assume
 * it's used solely for debug/informational purposes. The Linux driver
 * updates these values frequently as bugs are fixed and features are
 * added.
 */
#define	ENA_DRV_VER_MAJOR	1
#define	ENA_DRV_VER_MINOR	0
#define	ENA_DRV_VER_SUBMINOR	0

/*
 * Admin command timeout, initialized to the default value.
 * NOTE(review): presumably in nanoseconds (per the _ns suffix) and
 * consumed by the admin queue polling code -- confirm against its user.
 */
uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
331 
332 /*
333  * Log an error message. We leave the destination (console or system
334  * log) up to the caller
335  */
336 void
337 ena_err(const ena_t *ena, const char *fmt, ...)
338 {
339 	va_list ap;
340 
341 	va_start(ap, fmt);
342 	if (ena != NULL && ena->ena_dip != NULL) {
343 		vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
344 	} else {
345 		vcmn_err(CE_WARN, fmt, ap);
346 	}
347 	va_end(ap);
348 }
349 
350 /*
351  * Set this to B_TRUE to enable debug messages.
352  */
353 boolean_t ena_debug = B_FALSE;
354 
355 /*
356  * Log a debug message. We force all debug messages to go to the
357  * system log.
358  */
359 void
360 ena_dbg(const ena_t *ena, const char *fmt, ...)
361 {
362 	va_list ap;
363 
364 	if (ena_debug) {
365 		char msg[1024];
366 
367 		va_start(ap, fmt);
368 		(void) vsnprintf(msg, sizeof (msg), fmt, ap);
369 		va_end(ap);
370 
371 		if (ena != NULL && ena->ena_dip != NULL) {
372 			dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
373 		} else {
374 			cmn_err(CE_NOTE, "!%s", msg);
375 		}
376 	}
377 }
378 
379 ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = {
380 	{ .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" },
381 	{ .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" },
382 	{ .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" },
383 	{
384 		.eag_type = ENAHW_AENQ_GROUP_NOTIFICATION,
385 		.eag_str = "NOTIFICATION"
386 	},
387 	{ .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" },
388 	{
389 		.eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES,
390 		.eag_str = "REFRESH CAPABILITIES"
391 	},
392 };
393 
394 void
395 ena_aenq_work(ena_t *ena)
396 {
397 	ena_aenq_t *aenq = &ena->ena_aenq;
398 	uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
399 	boolean_t processed = B_FALSE;
400 	enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod];
401 	uint64_t ts;
402 
403 	ts = ((uint64_t)desc->ead_ts_high << 32) | (uint64_t)desc->ead_ts_low;
404 	ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL);
405 
406 	while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) {
407 		ena_aenq_hdlr_t hdlr;
408 
409 		ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM);
410 		processed = B_TRUE;
411 		ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64
412 		    " us", desc->ead_group,
413 		    ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome,
414 		    ts);
415 
416 		hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group];
417 		hdlr(ena, desc);
418 
419 		aenq->eaenq_head++;
420 		head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
421 
422 		if (head_mod == 0) {
423 			aenq->eaenq_phase ^= 1;
424 		}
425 
426 		desc = &aenq->eaenq_descs[head_mod];
427 	}
428 
429 	if (processed) {
430 		ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
431 		    aenq->eaenq_head);
432 	}
433 }
434 
/*
 * Use for attach sequences which perform no resource allocation (or
 * global state modification) and thus require no subsequent
 * deallocation.
 *
 * The ena argument is unused; it is present so this function has the
 * same shape as the other cleanup callbacks in this file.
 */
static void
ena_no_cleanup(ena_t *ena)
{
	/* Intentionally empty: there is nothing to undo. */
}
444 
445 static boolean_t
446 ena_attach_pci(ena_t *ena)
447 {
448 	ddi_acc_handle_t hdl;
449 
450 	if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
451 		return (B_FALSE);
452 	}
453 
454 	ena->ena_pci_hdl = hdl;
455 	ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
456 	ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
457 	ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
458 	ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
459 	ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
460 	ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
461 	    ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
462 	    ena->ena_pci_svid, ena->ena_pci_sdid);
463 
464 	return (B_TRUE);
465 }
466 
467 static void
468 ena_cleanup_pci(ena_t *ena)
469 {
470 	pci_config_teardown(&ena->ena_pci_hdl);
471 }
472 
473 static void
474 ena_cleanup_regs_map(ena_t *ena)
475 {
476 	ddi_regs_map_free(&ena->ena_reg_hdl);
477 }
478 
479 static boolean_t
480 ena_attach_regs_map(ena_t *ena)
481 {
482 	int ret = 0;
483 
484 	if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
485 	    DDI_SUCCESS) {
486 		ena_err(ena, "failed to get register set %d size",
487 		    ENA_REG_NUMBER);
488 		return (B_FALSE);
489 	}
490 
491 	ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
492 	bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
493 	ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
494 	ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
495 	ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
496 
497 	/*
498 	 * This function can return several different failure values,
499 	 * so we make sure to capture its return value for the purpose
500 	 * of logging.
501 	 */
502 	ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
503 	    &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
504 	    &ena->ena_reg_hdl);
505 
506 	if (ret != DDI_SUCCESS) {
507 		ena_err(ena, "failed to map register set %d: %d",
508 		    ENA_REG_NUMBER, ret);
509 		return (B_FALSE);
510 	}
511 
512 	ena_dbg(ena, "registers mapped to base: 0x%p",
513 	    (void *)ena->ena_reg_base);
514 
515 	return (B_TRUE);
516 }
517 
518 /*
519  * Free any resources related to the admin submission queue.
520  */
521 static void
522 ena_admin_sq_free(ena_t *ena)
523 {
524 	ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
525 }
526 
527 /*
528  * Initialize the admin submission queue.
529  */
530 static boolean_t
531 ena_admin_sq_init(ena_t *ena)
532 {
533 	ena_adminq_t *aq = &ena->ena_aq;
534 	ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
535 	size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
536 	uint32_t addr_low, addr_high, wval;
537 	ena_dma_conf_t conf = {
538 		.edc_size = size,
539 		.edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
540 		.edc_sgl = 1,
541 		.edc_endian = DDI_NEVERSWAP_ACC,
542 		.edc_stream = B_FALSE,
543 	};
544 
545 	if (!ena_dma_alloc(ena, dma, &conf, size)) {
546 		ena_err(ena, "failed to allocate DMA for Admin SQ");
547 		return (B_FALSE);
548 	}
549 
550 	aq->ea_sq.eas_entries = (void *)dma->edb_va;
551 	aq->ea_sq.eas_tail = 0;
552 	aq->ea_sq.eas_phase = 1;
553 	aq->ea_sq.eas_dbaddr =
554 	    (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
555 	ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
556 	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
557 	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
558 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
559 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
560 	wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
561 	    ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
562 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);
563 	return (B_TRUE);
564 }
565 
566 /*
567  * Free any resources related to the admin completion queue.
568  */
569 static void
570 ena_admin_cq_free(ena_t *ena)
571 {
572 	ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
573 }
574 
575 /*
576  * Initialize the admin completion queue.
577  */
578 static boolean_t
579 ena_admin_cq_init(ena_t *ena)
580 {
581 	ena_adminq_t *aq = &ena->ena_aq;
582 	ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
583 	size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
584 	uint32_t addr_low, addr_high, wval;
585 	ena_dma_conf_t conf = {
586 		.edc_size = size,
587 		.edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
588 		.edc_sgl = 1,
589 		.edc_endian = DDI_NEVERSWAP_ACC,
590 		.edc_stream = B_FALSE,
591 	};
592 
593 	if (!ena_dma_alloc(ena, dma, &conf, size)) {
594 		ena_err(ena, "failed to allocate DMA for Admin CQ");
595 		return (B_FALSE);
596 	}
597 
598 	aq->ea_cq.eac_entries = (void *)dma->edb_va;
599 	aq->ea_cq.eac_head = 0;
600 	aq->ea_cq.eac_phase = 1;
601 	ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
602 	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
603 	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
604 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
605 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
606 	wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
607 	    ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
608 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);
609 	return (B_TRUE);
610 }
611 
612 static void
613 ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc)
614 {
615 	ena_t *ena = data;
616 
617 	ena->ena_aenq_stat.eaes_default.value.ui64++;
618 	ena_dbg(ena, "unimplemented handler for aenq group: %s",
619 	    ena_groups_str[desc->ead_group].eag_str);
620 }
621 
622 static void
623 ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc)
624 {
625 	ena_t *ena = data;
626 	boolean_t is_up = (desc->ead_payload.link_change.flags &
627 	    ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0;
628 
629 	/*
630 	 * The interrupts are not enabled until after we register mac,
631 	 * so the mac handle should be valid.
632 	 */
633 	ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER);
634 	ena->ena_aenq_stat.eaes_link_change.value.ui64++;
635 
636 	mutex_enter(&ena->ena_lock);
637 
638 	/*
639 	 * Notify mac only on an actual change in status.
640 	 */
641 	if (ena->ena_link_up != is_up) {
642 		if (is_up) {
643 			mac_link_update(ena->ena_mh, LINK_STATE_UP);
644 		} else {
645 			mac_link_update(ena->ena_mh, LINK_STATE_DOWN);
646 		}
647 	}
648 
649 	ena->ena_link_up = is_up;
650 
651 	mutex_exit(&ena->ena_lock);
652 }
653 
654 /*
655  * Free any resources related to the Async Event Notification Queue.
656  */
657 static void
658 ena_aenq_free(ena_t *ena)
659 {
660 	ena_dma_free(&ena->ena_aenq.eaenq_dma);
661 }
662 
663 static void
664 ena_aenq_set_def_hdlrs(ena_aenq_t *aenq)
665 {
666 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr;
667 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr;
668 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr;
669 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] =
670 	    ena_aenq_default_hdlr;
671 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr;
672 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] =
673 	    ena_aenq_default_hdlr;
674 }
675 /*
676  * Initialize the Async Event Notification Queue.
677  */
678 static boolean_t
679 ena_aenq_init(ena_t *ena)
680 {
681 	ena_aenq_t *aenq = &ena->ena_aenq;
682 	size_t size;
683 	uint32_t addr_low, addr_high, wval;
684 	ena_dma_conf_t conf;
685 
686 	aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS;
687 	size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs);
688 
689 	/* BEGIN CSTYLED */
690 	conf = (ena_dma_conf_t) {
691 		.edc_size = size,
692 		.edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT,
693 		.edc_sgl = 1,
694 		.edc_endian = DDI_NEVERSWAP_ACC,
695 		.edc_stream = B_FALSE,
696 	};
697 	/* END CSTYLED */
698 
699 	if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) {
700 		ena_err(ena, "failed to allocate DMA for AENQ");
701 		return (B_FALSE);
702 	}
703 
704 	aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va;
705 	aenq->eaenq_head = 0;
706 	aenq->eaenq_phase = 1;
707 	bzero(aenq->eaenq_descs, size);
708 	ena_aenq_set_def_hdlrs(aenq);
709 
710 	aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] =
711 	    ena_aenq_link_change_hdlr;
712 
713 	ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress);
714 	addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress);
715 	addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32);
716 	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low);
717 	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high);
718 	ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV);
719 	wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) |
720 	    ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs));
721 	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval);
722 	return (B_TRUE);
723 }
724 
725 /*
726  * We limit the max number of I/O queues based on several aspects of
727  * the underlying hardware.
728  *
729  * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
730  *    which comes from the common code and presumably is based on device
731  *    constraints.
732  *
733  * 2. Next we latch the number of I/O queues to the number of online
734  *    CPUs. The idea being that each queue is a parallel work stream,
735  *    and having more queues than CPUs to flush them will not improve
736  *    performance. The number of online CPUs can change dynamically,
737  *    and that's okay, everything should still work fine, it just
738  *    might not be ideal.
739  *
740  * 3. Next we latch the number of I/O queues to the smallest of the
741  *    max Tx queues and max Rx queues. We could probably loosen this
742  *    restriction in the future, and have separate max I/O queues for
743  *    Tx and Rx. This is what Linux does, and seems like a fine place
744  *    to start.
745  */
746 static void
747 ena_set_max_io_queues(ena_t *ena)
748 {
749 	uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;
750 
751 	max = MIN(ncpus_online, max);
752 	/*
753 	 * Supposedly a device could present a different number of SQs
754 	 * and CQs. This driver is designed in a way that requires
755 	 * each SQ to have a corresponding and dedicated CQ (how would
756 	 * it work otherwise). Therefore, we must check both values
757 	 * and find the minimum between them.
758 	 */
759 	max = MIN(ena->ena_tx_max_sq_num, max);
760 	max = MIN(ena->ena_tx_max_cq_num, max);
761 	max = MIN(ena->ena_rx_max_sq_num, max);
762 	max = MIN(ena->ena_rx_max_cq_num, max);
763 
764 
765 	/* This shouldn't happen, but just in case. */
766 	if (max == 0) {
767 		max = 1;
768 	}
769 
770 	ena->ena_max_io_queues = max;
771 }
772 
773 /*
774  * We require that an Rx or Tx buffer be able to hold the maximum MTU
775  * along with the maximum frame header length. In this case we know
776  * ENA is presenting us an Ethernet frame so we add the size of an
777  * Ethernet VLAN header. Rx has the additional requirement of needing
778  * additional margin for the sake of IP header alignment.
779  */
780 static void
781 ena_update_buf_sizes(ena_t *ena)
782 {
783 	ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
784 	ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
785 	ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
786 	    ena->ena_page_sz, uint32_t);
787 	ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
788 	    ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
789 }
790 
791 static boolean_t
792 ena_get_offloads(ena_t *ena)
793 {
794 	int ret = 0;
795 	enahw_resp_desc_t resp;
796 	enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;
797 
798 	ena->ena_tx_l3_ipv4_csum = B_FALSE;
799 
800 	ena->ena_tx_l4_ipv4_part_csum = B_FALSE;
801 	ena->ena_tx_l4_ipv4_full_csum = B_FALSE;
802 	ena->ena_tx_l4_ipv4_lso = B_FALSE;
803 
804 	ena->ena_tx_l4_ipv6_part_csum = B_FALSE;
805 	ena->ena_tx_l4_ipv6_full_csum = B_FALSE;
806 	ena->ena_tx_l4_ipv6_lso = B_FALSE;
807 
808 	ena->ena_rx_l3_ipv4_csum = B_FALSE;
809 	ena->ena_rx_l4_ipv4_csum = B_FALSE;
810 	ena->ena_rx_l4_ipv6_csum = B_FALSE;
811 	ena->ena_rx_hash = B_FALSE;
812 
813 	bzero(&resp, sizeof (resp));
814 	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
815 	    ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);
816 
817 	if (ret == ENOTSUP) {
818 		/*
819 		 * In this case the device does not support querying
820 		 * for hardware offloads. We take that as a sign that
821 		 * the device provides no offloads.
822 		 */
823 		return (B_TRUE);
824 	} else if (ret != 0) {
825 		ena_err(ena, "error getting stateless offload: %d", ret);
826 		return (B_FALSE);
827 	}
828 
829 	ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);
830 
831 	ena->ena_tx_l4_ipv4_part_csum =
832 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
833 	ena->ena_tx_l4_ipv4_full_csum =
834 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
835 	ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);
836 
837 	ena->ena_tx_l4_ipv6_part_csum =
838 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
839 	ena->ena_tx_l4_ipv6_full_csum =
840 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
841 	ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);
842 
843 	ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
844 	ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
845 	ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
846 	return (B_TRUE);
847 }
848 
849 static int
850 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
851     const int defval)
852 {
853 	int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
854 	    DDI_PROP_DONTPASS, propname, defval);
855 
856 	if (value > maxval) {
857 		ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
858 		    propname, value, maxval);
859 		value = maxval;
860 	}
861 
862 	if (value < minval) {
863 		ena_err(ena, "user value %s=%d below minimum, setting to %d",
864 		    propname, value, minval);
865 		value = minval;
866 	}
867 
868 	return (value);
869 }
870 
871 static boolean_t
872 ena_set_mtu(ena_t *ena)
873 {
874 	int ret = 0;
875 	enahw_cmd_desc_t cmd;
876 	enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
877 	enahw_resp_desc_t resp;
878 
879 	bzero(&cmd, sizeof (cmd));
880 	bzero(&resp, sizeof (resp));
881 	feat->efm_mtu = ena->ena_mtu;
882 
883 	if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
884 	    ENAHW_FEAT_MTU_VER)) != 0) {
885 		ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
886 		    ret);
887 		return (B_FALSE);
888 	}
889 
890 	return (B_TRUE);
891 }
892 
893 static void
894 ena_get_link_config(ena_t *ena)
895 {
896 	enahw_resp_desc_t resp;
897 	enahw_feat_link_conf_t *feat =
898 	    &resp.erd_resp.erd_get_feat.ergf_link_conf;
899 	boolean_t full_duplex;
900 
901 	bzero(&resp, sizeof (resp));
902 
903 	if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
904 	    ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
905 		/*
906 		 * Some ENA devices do no support this feature. In
907 		 * those cases we report a 1Gbps link, full duplex.
908 		 * For the most accurate information on bandwidth
909 		 * limits see the official AWS documentation.
910 		 */
911 		ena->ena_link_speed_mbits = 1 * 1000 * 1000;
912 		ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
913 		ena->ena_link_duplex = LINK_DUPLEX_FULL;
914 		ena->ena_link_autoneg = B_TRUE;
915 		return;
916 	}
917 
918 	ena->ena_link_speed_mbits = feat->eflc_speed;
919 	ena->ena_link_speeds = feat->eflc_supported;
920 	full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
921 	ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
922 	    LINK_DUPLEX_HALF;
923 	ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
924 }
925 
926 /*
927  * Retrieve all configuration values which are modifiable via
928  * ena.conf, and set ena_t members accordingly. While the conf values
929  * have priority, they may be implicitly modified by the driver to
930  * meet resource constraints on a given platform. If no value is
931  * specified in the conf file, the driver will attempt to use the
932  * largest value supported. While there should be no value large
933  * enough, keep in mind that ena_get_prop() will cast the values to an
934  * int.
935  *
936  * This function should be called after the device is initialized,
937  * admin queue is established, and the hardware features/capabs have
938  * been queried; it should be called before mac registration.
939  */
940 static boolean_t
941 ena_attach_read_conf(ena_t *ena)
942 {
943 	uint32_t gcv;	/* Greatest Common Value */
944 
945 	/*
946 	 * We expect that the queue lengths are the same for both the
947 	 * CQ and SQ, but technically the device could return
948 	 * different lengths. For now the driver locks them together.
949 	 */
950 	gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
951 	ASSERT3U(gcv, <=, INT_MAX);
952 	ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
953 	    ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);
954 
955 	ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
956 	    ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
957 	    ENA_PROP_RXQ_INTR_LIMIT_DEF);
958 
959 	gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
960 	ASSERT3U(gcv, <=, INT_MAX);
961 	ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
962 	    ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);
963 
964 	return (B_TRUE);
965 }
966 
967 /*
968  * Perform any necessary device configuration after the driver.conf
969  * has been read.
970  */
971 static boolean_t
972 ena_attach_dev_cfg(ena_t *ena)
973 {
974 	ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);
975 
976 	if (!ena_set_mtu(ena)) {
977 		/*
978 		 * We don't expect this to fail, but we try a fallback
979 		 * first before failing the attach sequence.
980 		 */
981 		ena->ena_mtu = 1500;
982 		ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);
983 
984 		if (!ena_set_mtu(ena)) {
985 			return (B_FALSE);
986 		}
987 	}
988 
989 	return (B_TRUE);
990 }
991 
992 static boolean_t
993 ena_check_versions(ena_t *ena)
994 {
995 	uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
996 	uint32_t ctrl_vsn =
997 	    ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);
998 
999 	ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
1000 	ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);
1001 
1002 	ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
1003 	ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
1004 	ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
1005 	ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);
1006 
1007 	ena_dbg(ena, "device version: %u.%u",
1008 	    ena->ena_dev_major_vsn, ena->ena_dev_minor_vsn);
1009 	ena_dbg(ena, "controller version: %u.%u.%u implementation %u",
1010 	    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1011 	    ena->ena_ctrl_subminor_vsn, ena->ena_ctrl_impl_id);
1012 
1013 	if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
1014 		ena_err(ena, "unsupported controller version: %u.%u.%u",
1015 		    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1016 		    ena->ena_ctrl_subminor_vsn);
1017 		return (B_FALSE);
1018 	}
1019 
1020 	return (B_TRUE);
1021 }
1022 
1023 boolean_t
1024 ena_setup_aenq(ena_t *ena)
1025 {
1026 	enahw_cmd_desc_t cmd;
1027 	enahw_feat_aenq_t *cmd_feat =
1028 	    &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq;
1029 	enahw_resp_desc_t resp;
1030 	enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq;
1031 	enahw_aenq_groups_t to_enable;
1032 
1033 	bzero(&resp, sizeof (resp));
1034 	if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
1035 	    ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1036 		return (B_FALSE);
1037 	}
1038 
1039 	to_enable = BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) |
1040 	    BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) |
1041 	    BIT(ENAHW_AENQ_GROUP_WARNING) |
1042 	    BIT(ENAHW_AENQ_GROUP_NOTIFICATION);
1043 	to_enable &= resp_feat->efa_supported_groups;
1044 
1045 	bzero(&cmd, sizeof (cmd));
1046 	bzero(&resp, sizeof (cmd));
1047 	cmd_feat->efa_enabled_groups = to_enable;
1048 
1049 	if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG,
1050 	    ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1051 		return (B_FALSE);
1052 	}
1053 
1054 	bzero(&resp, sizeof (resp));
1055 	if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
1056 	    ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
1057 		return (B_FALSE);
1058 	}
1059 
1060 	ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups;
1061 	ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups;
1062 
1063 	for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) {
1064 		ena_aenq_grpstr_t *grpstr = &ena_groups_str[i];
1065 		boolean_t supported = BIT(grpstr->eag_type) &
1066 		    resp_feat->efa_supported_groups;
1067 		boolean_t enabled = BIT(grpstr->eag_type) &
1068 		    resp_feat->efa_enabled_groups;
1069 
1070 		ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str,
1071 		    supported ? "Y" : "N", enabled ? "Y" : "N");
1072 	}
1073 
1074 	return (B_TRUE);
1075 }
1076 
1077 /*
1078  * Free all resources allocated as part of ena_device_init().
1079  */
1080 static void
1081 ena_cleanup_device_init(ena_t *ena)
1082 {
1083 	ena_adminq_t *aq = &ena->ena_aq;
1084 
1085 	ena_free_host_info(ena);
1086 	mutex_destroy(&aq->ea_sq_lock);
1087 	mutex_destroy(&aq->ea_cq_lock);
1088 	mutex_destroy(&aq->ea_stat_lock);
1089 	list_destroy(&aq->ea_cmd_ctxs_free);
1090 	kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
1091 	ena_admin_sq_free(ena);
1092 	ena_admin_cq_free(ena);
1093 	ena_aenq_free(ena);
1094 	ena_stat_device_basic_cleanup(ena);
1095 	ena_stat_device_extended_cleanup(ena);
1096 	ena_stat_aenq_cleanup(ena);
1097 }
1098 
1099 static boolean_t
1100 ena_attach_device_init(ena_t *ena)
1101 {
1102 	ena_adminq_t *aq = &ena->ena_aq;
1103 	uint32_t rval, wval;
1104 	uint8_t dma_width;
1105 	hrtime_t timeout, cmd_timeout;
1106 	hrtime_t expired;
1107 	enahw_resp_desc_t resp;
1108 	enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
1109 	uint8_t *maddr;
1110 	uint32_t supported_features;
1111 	int ret = 0;
1112 
1113 	rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1114 	if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
1115 		ena_err(ena, "device is not ready");
1116 		return (B_FALSE);
1117 	}
1118 
1119 	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1120 
1121 	/*
1122 	 * The device stores the reset timeout at 100ms resolution; we
1123 	 * normalize that to nanoseconds.
1124 	 */
1125 	timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);
1126 
1127 	if (timeout == 0) {
1128 		ena_err(ena, "device gave invalid reset timeout");
1129 		return (B_FALSE);
1130 	}
1131 
1132 	expired = gethrtime() + timeout;
1133 
1134 	wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
1135 	wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
1136 	    ENAHW_DEV_CTL_RESET_REASON_MASK;
1137 	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);
1138 
1139 	/*
1140 	 * Make sure reset is in progress.
1141 	 */
1142 	while (1) {
1143 		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1144 
1145 		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) {
1146 			break;
1147 		}
1148 
1149 		if (gethrtime() > expired) {
1150 			ena_err(ena, "device reset start timed out");
1151 			return (B_FALSE);
1152 		}
1153 
1154 		/* Sleep for 100 milliseconds. */
1155 		delay(drv_usectohz(100 * 1000));
1156 	}
1157 
1158 	/*
1159 	 * Reset the timeout counter for the next device request.
1160 	 */
1161 	expired = gethrtime() + timeout;
1162 
1163 	/*
1164 	 * Wait for the device reset to finish.
1165 	 */
1166 	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
1167 	while (1) {
1168 		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
1169 
1170 		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
1171 			break;
1172 		}
1173 
1174 		if (gethrtime() > expired) {
1175 			ena_err(ena, "device reset timed out");
1176 			return (B_FALSE);
1177 		}
1178 
1179 		/* Sleep for 100 milliseconds. */
1180 		delay(drv_usectohz(100 * 1000));
1181 	}
1182 
1183 	if (!ena_check_versions(ena)) {
1184 		return (B_FALSE);
1185 	}
1186 
1187 	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1188 	dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
1189 	ena->ena_dma_width = dma_width;
1190 
1191 	/*
1192 	 * As we are not using an interrupt for admin queue completion
1193 	 * signaling, we do not need a priority on these mutexes. If
1194 	 * that changes, we will have to rejigger some code to create
1195 	 * the admin queue interrupt before this function.
1196 	 */
1197 	mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
1198 	mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
1199 	mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
1200 	aq->ea_qlen = ENA_ADMINQ_DEPTH;
1201 	aq->ea_pending_cmds = 0;
1202 
1203 	aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
1204 	    KM_SLEEP);
1205 	list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
1206 	    offsetof(ena_cmd_ctx_t, ectx_node));
1207 
1208 	for (uint_t i = 0; i < aq->ea_qlen; i++) {
1209 		ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i];
1210 
1211 		ctx->ectx_id = i;
1212 		ctx->ectx_pending = B_FALSE;
1213 		ctx->ectx_cmd_opcode = ENAHW_CMD_NONE;
1214 		ctx->ectx_resp = NULL;
1215 		list_insert_tail(&aq->ea_cmd_ctxs_free, ctx);
1216 	}
1217 
1218 	/*
1219 	 * The value stored in the device register is in the
1220 	 * resolution of 100 milliseconds. We normalize that to
1221 	 * nanoseconds.
1222 	 */
1223 	cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
1224 	aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);
1225 
1226 	if (aq->ea_cmd_timeout_ns == 0) {
1227 		aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
1228 	}
1229 
1230 	if (!ena_admin_sq_init(ena)) {
1231 		return (B_FALSE);
1232 	}
1233 
1234 	if (!ena_admin_cq_init(ena)) {
1235 		return (B_FALSE);
1236 	}
1237 
1238 	if (!ena_aenq_init(ena)) {
1239 		return (B_FALSE);
1240 	}
1241 
1242 	/*
1243 	 * Start in polling mode until we've determined the number of queues
1244 	 * and are ready to configure and enable interrupts.
1245 	 */
1246 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
1247 	aq->ea_poll_mode = B_TRUE;
1248 
1249 	bzero(&resp, sizeof (resp));
1250 	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
1251 	    ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);
1252 
1253 	if (ret != 0) {
1254 		ena_err(ena, "failed to get device attributes: %d", ret);
1255 		return (B_FALSE);
1256 	}
1257 
1258 	ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
1259 	ena_dbg(ena, "device version: %u", feat->efda_device_version);
1260 	ena_dbg(ena, "supported features: 0x%x",
1261 	    feat->efda_supported_features);
1262 	ena_dbg(ena, "device capabilities: 0x%x", feat->efda_capabilities);
1263 	ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
1264 	ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
1265 	maddr = feat->efda_mac_addr;
1266 	ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
1267 	    maddr[2], maddr[3], maddr[4], maddr[5]);
1268 	ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);
1269 
1270 	bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
1271 	ena->ena_max_mtu = feat->efda_max_mtu;
1272 	ena->ena_capabilities = feat->efda_capabilities;
1273 	supported_features = feat->efda_supported_features;
1274 	ena->ena_supported_features = supported_features;
1275 	feat = NULL;
1276 	bzero(&resp, sizeof (resp));
1277 
1278 	if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) {
1279 		enahw_feat_max_queue_ext_t *feat_mqe =
1280 		    &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;
1281 
1282 		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
1283 		    ENAHW_FEAT_MAX_QUEUES_EXT_VER);
1284 
1285 		if (ret != 0) {
1286 			ena_err(ena, "failed to query max queues ext: %d", ret);
1287 			return (B_FALSE);
1288 		}
1289 
1290 		ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
1291 		ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
1292 		ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
1293 		ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
1294 		ena->ena_tx_max_desc_per_pkt =
1295 		    feat_mqe->efmqe_max_per_packet_tx_descs;
1296 		ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;
1297 
1298 		ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
1299 		ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
1300 		ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
1301 		ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
1302 		ena->ena_rx_max_desc_per_pkt =
1303 		    feat_mqe->efmqe_max_per_packet_rx_descs;
1304 
1305 		ena_set_max_io_queues(ena);
1306 	} else {
1307 		enahw_feat_max_queue_t *feat_mq =
1308 		    &resp.erd_resp.erd_get_feat.ergf_max_queue;
1309 
1310 		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
1311 		    ENAHW_FEAT_MAX_QUEUES_NUM_VER);
1312 
1313 		if (ret != 0) {
1314 			ena_err(ena, "failed to query max queues: %d", ret);
1315 			return (B_FALSE);
1316 		}
1317 
1318 		ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
1319 		ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1320 		ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
1321 		ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1322 		ena->ena_tx_max_desc_per_pkt =
1323 		    feat_mq->efmq_max_per_packet_tx_descs;
1324 		ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;
1325 
1326 		ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
1327 		ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1328 		ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
1329 		ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1330 		ena->ena_rx_max_desc_per_pkt =
1331 		    feat_mq->efmq_max_per_packet_rx_descs;
1332 
1333 		ena_set_max_io_queues(ena);
1334 	}
1335 
1336 	ena->ena_mtu = ena->ena_max_mtu;
1337 	ena_update_buf_sizes(ena);
1338 	/*
1339 	 * We could use ENAHW_FEAT_HW_HINTS to determine actual SGL
1340 	 * sizes, for now we just force everything to use one
1341 	 * segment.
1342 	 */
1343 	ena->ena_tx_sgl_max_sz = 1;
1344 	ena->ena_rx_sgl_max_sz = 1;
1345 
1346 	if (!ena_init_host_info(ena)) {
1347 		return (B_FALSE);
1348 	}
1349 
1350 	if (!ena_setup_aenq(ena)) {
1351 		return (B_FALSE);
1352 	}
1353 
1354 	ena_get_link_config(ena);
1355 
1356 	if (!ena_get_offloads(ena)) {
1357 		return (B_FALSE);
1358 	}
1359 
1360 	if (!ena_stat_device_basic_init(ena)) {
1361 		return (B_FALSE);
1362 	}
1363 
1364 	if (!ena_stat_device_extended_init(ena)) {
1365 		return (B_FALSE);
1366 	}
1367 
1368 	if (!ena_stat_aenq_init(ena)) {
1369 		return (B_FALSE);
1370 	}
1371 
1372 	return (B_TRUE);
1373 }
1374 
1375 static void
1376 ena_cleanup_intr_alloc(ena_t *ena)
1377 {
1378 	for (int i = 0; i < ena->ena_num_intrs; i++) {
1379 		int ret = ddi_intr_free(ena->ena_intr_handles[i]);
1380 		if (ret != DDI_SUCCESS) {
1381 			ena_err(ena, "failed to free interrupt %d: %d", i, ret);
1382 		}
1383 	}
1384 
1385 	if (ena->ena_intr_handles != NULL) {
1386 		kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
1387 		ena->ena_intr_handles = NULL;
1388 		ena->ena_intr_handles_sz = 0;
1389 	}
1390 }
1391 
1392 /*
1393  * The Linux driver supports only MSI-X interrupts. We do the same,
1394  * with the assumption that it's the only type of interrupt the device
1395  * can present.
1396  */
1397 static boolean_t
1398 ena_attach_intr_alloc(ena_t *ena)
1399 {
1400 	int ret;
1401 	int types;
1402 	int min, req, ideal, avail, actual;
1403 
1404 	ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
1405 	if (ret != DDI_SUCCESS) {
1406 		ena_err(ena, "failed to get interrupt types: %d", ret);
1407 		return (B_FALSE);
1408 	}
1409 
1410 	ena_dbg(ena, "supported interrupt types: 0x%x", types);
1411 	if ((types & DDI_INTR_TYPE_MSIX) == 0) {
1412 		ena_err(ena, "the ena driver only supports MSI-X interrupts");
1413 		return (B_FALSE);
1414 	}
1415 
1416 	/* One for I/O, one for adminq. */
1417 	min = 2;
1418 	ideal = ena->ena_max_io_queues + 1;
1419 	ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1420 	if (ret != DDI_SUCCESS) {
1421 		ena_err(ena, "failed to get number of MSI-X interrupts: %d",
1422 		    ret);
1423 		return (B_FALSE);
1424 	}
1425 
1426 	if (avail < min) {
1427 		ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
1428 		    "requires a minimum of %d", avail, min);
1429 		return (B_FALSE);
1430 	}
1431 
1432 	ena_dbg(ena, "%d MSI-X interrupts available", avail);
1433 
1434 	ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1435 	if (ret != DDI_SUCCESS) {
1436 		ena_err(ena, "failed to get available interrupts: %d", ret);
1437 		return (B_FALSE);
1438 	}
1439 
1440 	if (avail < min) {
1441 		ena_err(ena, "number of available MSI-X interrupts is %d, "
1442 		    "but the driver requires a minimum of %d", avail, min);
1443 		return (B_FALSE);
1444 	}
1445 
1446 	req = MIN(ideal, avail);
1447 	ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
1448 	ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);
1449 
1450 	ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
1451 	    DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
1452 	if (ret != DDI_SUCCESS) {
1453 		ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
1454 		    req, ret);
1455 		return (B_FALSE);
1456 	}
1457 
1458 	if (actual < min) {
1459 		ena_err(ena, "number of allocated interrupts is %d, but the "
1460 		    "driver requires a minimum of %d", actual, min);
1461 		return (B_FALSE);
1462 	}
1463 
1464 	ena->ena_num_intrs = actual;
1465 
1466 	ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
1467 	if (ret != DDI_SUCCESS) {
1468 		ena_err(ena, "failed to get interrupt capability: %d", ret);
1469 		return (B_FALSE);
1470 	}
1471 
1472 	ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
1473 	if (ret != DDI_SUCCESS) {
1474 		ena_err(ena, "failed to get interrupt priority: %d", ret);
1475 		return (B_FALSE);
1476 	}
1477 
1478 	ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
1479 	    actual, ena->ena_intr_caps, ena->ena_intr_pri);
1480 
1481 	/*
1482 	 * The ena_lock should not be held in the datapath, but it is
1483 	 * held as part of the AENQ handler, which runs in interrupt
1484 	 * context. Therefore, we delayed the initialization of this
1485 	 * mutex until after the interrupts are allocated.
1486 	 */
1487 	mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
1488 	    DDI_INTR_PRI(ena->ena_intr_pri));
1489 
1490 	return (B_TRUE);
1491 }
1492 
1493 /*
1494  * Allocate the parent Rx queue structures. More importantly, this is
1495  * NOT allocating the queue descriptors or data buffers. Those are
1496  * allocated on demand as queues are started.
1497  */
1498 static boolean_t
1499 ena_attach_alloc_rxqs(ena_t *ena)
1500 {
1501 	/* We rely on the interrupt priority for initializing the mutexes. */
1502 	VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1503 	ena->ena_num_rxqs = ena->ena_num_intrs - 1;
1504 	ASSERT3U(ena->ena_num_rxqs, >, 0);
1505 	ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs),
1506 	    KM_SLEEP);
1507 
1508 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1509 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1510 
1511 		rxq->er_rxqs_idx = i;
1512 		/* The 0th vector is for Admin + AENQ. */
1513 		rxq->er_intr_vector = i + 1;
1514 		rxq->er_mrh = NULL;
1515 
1516 		mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
1517 		    DDI_INTR_PRI(ena->ena_intr_pri));
1518 		mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
1519 		    DDI_INTR_PRI(ena->ena_intr_pri));
1520 
1521 		rxq->er_ena = ena;
1522 		rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
1523 		rxq->er_cq_num_descs = ena->ena_rxq_num_descs;
1524 
1525 		if (!ena_stat_rxq_init(rxq)) {
1526 			return (B_FALSE);
1527 		}
1528 
1529 		if (!ena_alloc_rxq(rxq)) {
1530 			return (B_FALSE);
1531 		}
1532 	}
1533 
1534 	return (B_TRUE);
1535 }
1536 
1537 static void
1538 ena_cleanup_rxqs(ena_t *ena)
1539 {
1540 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1541 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1542 
1543 		ena_cleanup_rxq(rxq);
1544 		mutex_destroy(&rxq->er_lock);
1545 		mutex_destroy(&rxq->er_stat_lock);
1546 		ena_stat_rxq_cleanup(rxq);
1547 	}
1548 
1549 	kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
1550 }
1551 
1552 /*
1553  * Allocate the parent Tx queue structures. More importantly, this is
1554  * NOT allocating the queue descriptors or data buffers. Those are
1555  * allocated on demand as a queue is started.
1556  */
1557 static boolean_t
1558 ena_attach_alloc_txqs(ena_t *ena)
1559 {
1560 	/* We rely on the interrupt priority for initializing the mutexes. */
1561 	VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1562 	ena->ena_num_txqs = ena->ena_num_intrs - 1;
1563 	ASSERT3U(ena->ena_num_txqs, >, 0);
1564 	ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs),
1565 	    KM_SLEEP);
1566 
1567 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1568 		ena_txq_t *txq = &ena->ena_txqs[i];
1569 
1570 		txq->et_txqs_idx = i;
1571 		/* The 0th vector is for Admin + AENQ. */
1572 		txq->et_intr_vector = i + 1;
1573 		txq->et_mrh = NULL;
1574 
1575 		mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
1576 		    DDI_INTR_PRI(ena->ena_intr_pri));
1577 		mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
1578 		    DDI_INTR_PRI(ena->ena_intr_pri));
1579 
1580 		txq->et_ena = ena;
1581 		txq->et_sq_num_descs = ena->ena_txq_num_descs;
1582 		txq->et_cq_num_descs = ena->ena_txq_num_descs;
1583 
1584 		if (!ena_stat_txq_init(txq)) {
1585 			return (B_FALSE);
1586 		}
1587 
1588 		if (!ena_alloc_txq(txq)) {
1589 			return (B_FALSE);
1590 		}
1591 	}
1592 
1593 	return (B_TRUE);
1594 }
1595 
1596 static void
1597 ena_cleanup_txqs(ena_t *ena)
1598 {
1599 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1600 		ena_txq_t *txq = &ena->ena_txqs[i];
1601 
1602 		ena_cleanup_txq(txq);
1603 		mutex_destroy(&txq->et_lock);
1604 		mutex_destroy(&txq->et_stat_lock);
1605 		ena_stat_txq_cleanup(txq);
1606 	}
1607 
1608 	kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs));
1609 }
1610 
1611 ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = {
1612 	{
1613 		.ead_seq = ENA_ATTACH_PCI,
1614 		.ead_name = "PCI config",
1615 		.ead_attach_fn = ena_attach_pci,
1616 		.ead_attach_hard_fail = B_TRUE,
1617 		.ead_cleanup_fn = ena_cleanup_pci,
1618 	},
1619 
1620 	{
1621 		.ead_seq = ENA_ATTACH_REGS,
1622 		.ead_name = "BAR mapping",
1623 		.ead_attach_fn = ena_attach_regs_map,
1624 		.ead_attach_hard_fail = B_TRUE,
1625 		.ead_cleanup_fn = ena_cleanup_regs_map,
1626 	},
1627 
1628 	{
1629 		.ead_seq = ENA_ATTACH_DEV_INIT,
1630 		.ead_name = "device initialization",
1631 		.ead_attach_fn = ena_attach_device_init,
1632 		.ead_attach_hard_fail = B_TRUE,
1633 		.ead_cleanup_fn = ena_cleanup_device_init,
1634 	},
1635 
1636 	{
1637 		.ead_seq = ENA_ATTACH_READ_CONF,
1638 		.ead_name = "ena.conf",
1639 		.ead_attach_fn = ena_attach_read_conf,
1640 		.ead_attach_hard_fail = B_TRUE,
1641 		.ead_cleanup_fn = ena_no_cleanup,
1642 	},
1643 
1644 	{
1645 		.ead_seq = ENA_ATTACH_DEV_CFG,
1646 		.ead_name = "device config",
1647 		.ead_attach_fn = ena_attach_dev_cfg,
1648 		.ead_attach_hard_fail = B_TRUE,
1649 		.ead_cleanup_fn = ena_no_cleanup,
1650 	},
1651 
1652 	{
1653 		.ead_seq = ENA_ATTACH_INTR_ALLOC,
1654 		.ead_name = "interrupt allocation",
1655 		.ead_attach_fn = ena_attach_intr_alloc,
1656 		.ead_attach_hard_fail = B_TRUE,
1657 		.ead_cleanup_fn = ena_cleanup_intr_alloc,
1658 	},
1659 
1660 	{
1661 		.ead_seq = ENA_ATTACH_INTR_HDLRS,
1662 		.ead_name = "interrupt handlers",
1663 		.ead_attach_fn = ena_intr_add_handlers,
1664 		.ead_attach_hard_fail = B_TRUE,
1665 		.ead_cleanup_fn = ena_intr_remove_handlers,
1666 	},
1667 
1668 	{
1669 		.ead_seq = ENA_ATTACH_TXQS_ALLOC,
1670 		.ead_name = "Tx queues",
1671 		.ead_attach_fn = ena_attach_alloc_txqs,
1672 		.ead_attach_hard_fail = B_TRUE,
1673 		.ead_cleanup_fn = ena_cleanup_txqs,
1674 	},
1675 
1676 	{
1677 		.ead_seq = ENA_ATTACH_RXQS_ALLOC,
1678 		.ead_name = "Rx queues",
1679 		.ead_attach_fn = ena_attach_alloc_rxqs,
1680 		.ead_attach_hard_fail = B_TRUE,
1681 		.ead_cleanup_fn = ena_cleanup_rxqs,
1682 	},
1683 
1684 	/*
1685 	 * The chance of mac_unregister() failure poses a problem to
1686 	 * cleanup. We address interrupt disablement and mac
1687 	 * unregistration explicitly in the attach/detach routines.
1688 	 */
1689 	{
1690 		.ead_seq = ENA_ATTACH_MAC_REGISTER,
1691 		.ead_name = "mac registration",
1692 		.ead_attach_fn = ena_mac_register,
1693 		.ead_attach_hard_fail = B_TRUE,
1694 		.ead_cleanup_fn = ena_no_cleanup,
1695 	},
1696 
1697 	{
1698 		.ead_seq = ENA_ATTACH_INTRS_ENABLE,
1699 		.ead_name = "enable interrupts",
1700 		.ead_attach_fn = ena_intrs_enable,
1701 		.ead_attach_hard_fail = B_TRUE,
1702 		.ead_cleanup_fn = ena_no_cleanup,
1703 	}
1704 };
1705 
1706 /*
1707  * This function undoes any work done by ena_attach(), either in
1708  * response to a failed attach or a planned detach. At the end of this
1709  * function ena_attach_seq should be zero, otherwise it means
1710  * something has not be freed/uninitialized.
1711  */
1712 static void
1713 ena_cleanup(ena_t *ena)
1714 {
1715 	if (ena == NULL || ena->ena_attach_seq == 0) {
1716 		return;
1717 	}
1718 
1719 	/*
1720 	 * We VERIFY this because if the seq is greater than entries
1721 	 * we drift into space and execute god knows what.
1722 	 */
1723 	VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);
1724 
1725 	while (ena->ena_attach_seq > 0) {
1726 		int idx = ena->ena_attach_seq - 1;
1727 		ena_attach_desc_t *desc = &ena_attach_tbl[idx];
1728 
1729 		ena_dbg(ena, "running cleanup sequence: %s (%d)",
1730 		    desc->ead_name, idx);
1731 
1732 		desc->ead_cleanup_fn(ena);
1733 		ena->ena_attach_seq--;
1734 	}
1735 
1736 	ASSERT3U(ena->ena_attach_seq, ==, 0);
1737 	mutex_destroy(&ena->ena_lock);
1738 }
1739 
1740 static int
1741 ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1742 {
1743 	ena_t *ena;
1744 
1745 	if (cmd != DDI_ATTACH) {
1746 		return (DDI_FAILURE);
1747 	}
1748 
1749 	ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
1750 	ena->ena_instance = ddi_get_instance(dip);
1751 	ena->ena_dip = dip;
1752 	ena->ena_instance = ddi_get_instance(dip);
1753 	ena->ena_page_sz = ddi_ptob(dip, 1);
1754 
1755 	for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
1756 		boolean_t success;
1757 		ena_attach_desc_t *desc = &ena_attach_tbl[i];
1758 
1759 		ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
1760 		    i);
1761 
1762 		if (!(success = desc->ead_attach_fn(ena))) {
1763 			ena_err(ena, "attach sequence failed: %s (%d)",
1764 			    desc->ead_name, i);
1765 
1766 			if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
1767 				/*
1768 				 * In this specific case
1769 				 * ENA_ATTACH_INTRS_ENABLE has failed,
1770 				 * and we may or may not be able to
1771 				 * unregister the mac, depending on if
1772 				 * something in userspace has created
1773 				 * a client on top.
1774 				 *
1775 				 * NOTE: Something that would be nice
1776 				 * to add to mac is the ability to
1777 				 * register a provider separate from
1778 				 * "publishing" it to the rest of the
1779 				 * system. This would allow a driver
1780 				 * to register its mac, do some
1781 				 * additional work that might fail,
1782 				 * and then unregister if that work
1783 				 * fails without concern for any
1784 				 * chance of failure when calling
1785 				 * unregister. This would remove the
1786 				 * complexity of the situation we are
1787 				 * trying to address here, as we would
1788 				 * know that until the mac has been
1789 				 * "published", there is no chance for
1790 				 * mac_unregister() to fail.
1791 				 */
1792 				if (ena_mac_unregister(ena) != 0) {
1793 					return (DDI_FAILURE);
1794 				}
1795 
1796 				ena->ena_attach_seq--;
1797 			} else {
1798 				/*
1799 				 * Since the ead_seq is predicated on
1800 				 * successful ead_attach_fn we must
1801 				 * run the specific cleanup handler
1802 				 * before calling the global cleanup
1803 				 * routine. This also means that all
1804 				 * cleanup functions must be able to
1805 				 * deal with partial success of the
1806 				 * corresponding ead_attach_fn.
1807 				 */
1808 				desc->ead_cleanup_fn(ena);
1809 			}
1810 
1811 			ena_cleanup(ena);
1812 			kmem_free(ena, sizeof (ena_t));
1813 			return (DDI_FAILURE);
1814 		}
1815 
1816 		if (success) {
1817 			ena_dbg(ena, "attach sequence completed: %s (%d)",
1818 			    desc->ead_name, i);
1819 		}
1820 
1821 		ena->ena_attach_seq = desc->ead_seq;
1822 	}
1823 
1824 	/*
1825 	 * Now that interrupts are enabled make sure to tell the
1826 	 * device that all AENQ descriptors are ready for writing, and
1827 	 * unmask the admin interrupt.
1828 	 *
1829 	 * Note that this interrupt is generated for both the admin queue and
1830 	 * the AENQ, but this driver always polls the admin queue. The surplus
1831 	 * interrupt for admin command completion triggers a harmless check of
1832 	 * the AENQ.
1833 	 */
1834 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1835 	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
1836 	    ena->ena_aenq.eaenq_num_descs);
1837 
1838 	ddi_set_driver_private(dip, ena);
1839 	return (DDI_SUCCESS);
1840 }
1841 
1842 static int
1843 ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1844 {
1845 	ena_t *ena = ddi_get_driver_private(dip);
1846 
1847 	if (ena == NULL) {
1848 		return (DDI_FAILURE);
1849 	}
1850 
1851 	/*
1852 	 * Before we can proceed to cleanup we have to treat
1853 	 * mac_unregister() explicitly -- if there are still
1854 	 * outstanding clients, then we can't proceed with detach or
1855 	 * cleanup.
1856 	 */
1857 
1858 	/*
1859 	 * Why this would fail I don't know, but if we proceed to mac
1860 	 * unregister, then there is a good chance we will panic in
1861 	 * the Rx interrupt handler when calling mac_rx_ring()
1862 	 */
1863 	if (!ena_intrs_disable(ena)) {
1864 		return (DDI_FAILURE);
1865 	}
1866 
1867 	/* We can't detach if clients are actively using the device. */
1868 	if (ena_mac_unregister(ena) != 0) {
1869 		(void) ena_intrs_enable(ena);
1870 		return (DDI_FAILURE);
1871 	}
1872 
1873 	/*
1874 	 * At this point we can proceed with the rest of cleanup on a
1875 	 * best-effort basis.
1876 	 */
1877 	ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
1878 	ena_cleanup(ena);
1879 	ddi_set_driver_private(dip, NULL);
1880 	kmem_free(ena, sizeof (ena_t));
1881 	return (DDI_SUCCESS);
1882 }
1883 
1884 static struct cb_ops ena_cb_ops = {
1885 	.cb_open = nodev,
1886 	.cb_close = nodev,
1887 	.cb_strategy = nodev,
1888 	.cb_print = nodev,
1889 	.cb_dump = nodev,
1890 	.cb_read = nodev,
1891 	.cb_write = nodev,
1892 	.cb_ioctl = nodev,
1893 	.cb_devmap = nodev,
1894 	.cb_mmap = nodev,
1895 	.cb_segmap = nodev,
1896 	.cb_chpoll = nochpoll,
1897 	.cb_prop_op = ddi_prop_op,
1898 	.cb_flag = D_MP,
1899 	.cb_rev = CB_REV,
1900 	.cb_aread = nodev,
1901 	.cb_awrite = nodev
1902 };
1903 
1904 static struct dev_ops ena_dev_ops = {
1905 	.devo_rev = DEVO_REV,
1906 	.devo_refcnt = 0,
1907 	.devo_getinfo = NULL,
1908 	.devo_identify = nulldev,
1909 	.devo_probe = nulldev,
1910 	.devo_attach = ena_attach,
1911 	.devo_detach = ena_detach,
1912 	.devo_reset = nodev,
1913 	.devo_quiesce = ddi_quiesce_not_supported,
1914 	.devo_cb_ops = &ena_cb_ops
1915 };
1916 
1917 static struct modldrv ena_modldrv = {
1918 	.drv_modops = &mod_driverops,
1919 	.drv_linkinfo = "AWS ENA Ethernet",
1920 	.drv_dev_ops = &ena_dev_ops
1921 };
1922 
1923 static struct modlinkage ena_modlinkage = {
1924 	.ml_rev = MODREV_1,
1925 	.ml_linkage = { &ena_modldrv, NULL }
1926 };
1927 
1928 int
1929 _init(void)
1930 {
1931 	int ret;
1932 
1933 	mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);
1934 
1935 	if ((ret = mod_install(&ena_modlinkage)) != 0) {
1936 		mac_fini_ops(&ena_dev_ops);
1937 		return (ret);
1938 	}
1939 
1940 	return (ret);
1941 }
1942 
1943 int
1944 _info(struct modinfo *modinfop)
1945 {
1946 	return (mod_info(&ena_modlinkage, modinfop));
1947 }
1948 
1949 int
1950 _fini(void)
1951 {
1952 	int ret;
1953 
1954 	if ((ret = mod_remove(&ena_modlinkage)) != 0) {
1955 		return (ret);
1956 	}
1957 
1958 	mac_fini_ops(&ena_dev_ops);
1959 	return (ret);
1960 }
1961