xref: /illumos-gate/usr/src/uts/common/io/ena/ena.c (revision 4d8d108f42a089b7b4441353f2ad7a75e1c7b31d)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2024 Oxide Computer Company
14  */
15 
16 #include "ena_hw.h"
17 #include "ena.h"
18 
19 /*
20  * Elastic Network Adapter (ENA) Driver
21  * ------------------------------------
22  *
23  * The ena driver provides support for the AWS ENA device, also
24  * referred to as their "enhanced networking". This device is present
25  * on "Nitro"-based instances. It presents itself with the following
26  * PCI Vendor/Device IDs
27  *
28  *    o 1d0f:0ec2 -- ENA PF
29  *    o 1d0f:1ec2 -- ENA PF (Reserved)
30  *    o 1d0f:ec20 -- ENA VF
31  *    o 1d0f:ec21 -- ENA VF (Reserved)
32  *
33  * This driver provides support for only the essential features needed
34  * to drive traffic on an ENA device. Support for the following
35  * features IS NOT currently implemented.
36  *
37  *    o Admin Queue Interrupts: queue completion events are always polled
38  *    o FMA
39  *    o Rx checksum offloads
40  *    o Tx checksum offloads
41  *    o Tx DMA bind (borrow buffers)
42  *    o Rx DMA bind (loaned buffers)
43  *    o TSO
44  *    o RSS
45  *    o Low Latency Queues (LLQ)
46  *    o Support for different Tx completion policies
47  *    o More controlled Tx recycling and Rx refill
48  *
49  * Even without these features the ena driver should perform
50  * reasonably well.
51  *
52  * Driver vs. Hardware Types
53  * -------------------------
54  *
55  * To properly communicate with the ENA device the driver must
56  * populate memory (registers and buffers) with specific types. These
57  * types are defined by the device and are found under the "common"
58  * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
59  * simplified this a bit by defining all device-specific types in the
60  * ena_hw.h file. Furthermore, all device-specific types are given an
61  * "enahw" prefix. This makes it clear when we are dealing with a
62  * device type and when we are dealing with a driver type.
63  *
64  * [1]: https://github.com/amzn/amzn-drivers
65  *
66  * Groups, Rings (Queues), and Interrupts
67  * --------------------------------------
68  *
69  * The ENA device presents one mac group. This single mac group
70  * represents the single unicast address that this device represents
71  * in your AWS instance. The ENA device presents no option for
72  * configuring additional MAC addresses, multicast, or promisc mode --
73  * you receive only what AWS wants you to receive.
74  *
75  * This single mac group may have one or more rings. The ENA driver
76  * refers to rings as queues, for no special reason other than it was
77  * the dominant language in the Linux and FreeBSD drivers, and it
78  * spilled over into this port. The upper bound on number of queues is
79  * presented by the device. However, we don't just go with whatever
80  * number of queues the device reports; but rather we limit the queues
81  * based on other factors such as an absolute maximum, number of
82  * online CPUs, and number of available interrupts. The upper bound is
83  * calculated by ena_set_max_io_queues(), and that is used and
84  * possibly further restricted in ena_attach_intr_alloc(). At this
85  * point, ultimately, it is the number of available interrupts (minus
86  * one for the admin queue) that determines the number of queues: one
87  * Tx and one Rx on each I/O interrupt.
88  *
89  * NOTE: Perhaps it is overly restrictive to limit the number of
90  * queues to the number of I/O interrupts. Something worth considering
91  * on larger instances if they present far fewer interrupts than they
92  * do queues + CPUs.
93  *
94  * The ENA device presents MSI-X interrupts only. During attach the
95  * driver queries the number of available interrupts and sets aside
96  * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
97  * This means that a Tx/Rx queue at index 0 will map to vector 1, and
98  * so on.
99  *
100  * NOTE: The ENA driver currently doesn't make full use of the Admin
101  * Queue interrupt. This interrupt is used both to notify the driver
102  * when a command response is ready, and when an async event is posted.
103  * The ENA driver always polls the Admin Queue for responses.
104  *
105  * Tx Queue Workings
106  * -----------------
107  *
108  * A single Tx queue (ena_txq_t) is made up of one submission queue
109  * (SQ) and its paired completion queue (CQ). These two queues form a
110  * logical descriptor ring which is used to send packets out of the
111  * device -- where each SQ entry describes the packet to be sent
112  * (enahw_tx_desc_t) and each CQ entry describes the result of sending
113  * a packet (enahw_tx_cdesc_t). For this to work the host and device
114  * must agree on which descriptors are currently owned by the host
115  * (free for sending) and which are owned by the device (pending
116  * device completion). This state is tracked on the host side via head
117  * and tail indexes along with a phase value.
118  *
119  * The head and tail values represent the head and tail of the FIFO
120  * queue of pending packets -- the next packet to be sent by the
121  * device is head, and all descriptors up to tail are ready for
122  * sending. The phase allows the host to determine which CQ
123  * descriptors represent completed events when using per-SQ completion
124  * events (as opposed to queue head pointer updates). As the queues
125  * represent a logical ring buffer, the phase must alternate on
126  * wrap-around. The device initializes the phase to zero, and the host
127  * starts with a phase of 1. The first packet descriptor writes, and
128  * their corresponding completions, are indicated with a phase of 1.
129  *
130  *
131  * For example, the diagram below represents the SQ/CQ state after the
132  * first 6 packets have been sent by the host and 2 of them have been
133  * completed by the device (and these completions have been processed
134  * by the driver). In this state the host could send 4 more packets
135  * before needing to wait on completion events.
136  *
137  *
138  *    +---+---+---+---+---+---+---+---+
139  * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |   phase = 1
140  *    +---+---+---+---+---+---+---+---+
141  *                              ^
142  *                              |
143  *                            tail
144  *            head
145  *              |
146  *              v
147  *    +---+---+---+---+---+---+---+---+
148  * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |   phase = 1
149  *    +---+---+---+---+---+---+---+---+
150  *
151  *
152  * The next diagram shows how the state changes as 5 more packets are
153  * sent (for a total of 11) and 7 more are completed (for a total of
154  * 9). Notice that as the SQ and CQ have wrapped around their phases
155  * have been complemented. In this state the host could send 6 more
156  * packets before needing to wait on completion events.
157  *
158  *    +---+---+---+---+---+---+---+---+
159  * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |   phase = 0
160  *    +---+---+---+---+---+---+---+---+
161  *                  ^
162  *                  |
163  *                tail
164  *        head
165  *          |
166  *          v
167  *    +---+---+---+---+---+---+---+---+
168  * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |   phase = 0
169  *    +---+---+---+---+---+---+---+---+
170  *
171  *
172  * Currently, all packets are copied for Tx. At ring start we allocate
173  * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has
174  * a DMA buffer associated with it; and each buffer is large enough to
175  * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
176  * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
177  * the TCB's DMA buffer, and a new descriptor is written to the SQ
178  * describing said TCB buffer. If and when we add more advanced
179  * features like DMA binding of mblks and TSO, this 1:1 guarantee will
180  * no longer hold.
181  *
182  * Rx Queue Workings
183  * -----------------
184  *
185  * In terms of implementing the logical descriptor ring, the Rx queues
186  * are very much like the Tx queues. There is a paired SQ and CQ for
187  * each logical ring. The difference is that in Rx the SQ is for
188  * handing buffers to the device to fill, and the CQ is for describing
189  * the contents of those buffers for a given received frame. At Rx
190  * ring start we allocate a Rx Control Buffer (RCB) for each
191  * descriptor in the ring. Each RCB has a DMA buffer associated with
192  * it; and each buffer is large enough to hold the MTU. For each
193  * received frame we copy the contents out of the RCB and into its own
194  * mblk, immediately returning the RCB for reuse. As with Tx, this
195  * gives us a simple 1:1 mapping currently, but if more advanced
196  * features are implemented later this could change.
197  *
198  * Asynchronous Event Notification Queue (AENQ)
199  * --------------------------------------------
200  *
201  * Each ENA device comes with a mechanism for sending out-of-band
202  * notifications to the driver. This includes events like link state
203  * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
204  * delivery mechanism is via interrupt, handled by the ena_aenq_work()
205  * function, which dispatches via the eaenq_hdlrs table. If no handler
206  * is registered, the ena_aenq_default_hdlr() handler is used. A given
207  * device may not support all the different event types
208  * (enahw_aenq_groups_t); and the driver may choose to enable a subset
209  * of the supported events. During attach we call ena_aenq_configure()
210  * to negotiate the supported/enabled events. The enabled group is
211  * stored at ena_aenq_enabled_groups.
212  *
213  * Queues and Unsigned Wraparound
214  * ------------------------------
215  *
216  * All the queues use a uint16_t value as their head/tail values, e.g.
217  * the Rx queue's er_cq_head_idx value. You might notice that we only
218  * ever increment these values, letting them perform implicit unsigned
219  * integer wraparound. This is intended. This is the same behavior as
220  * the common code, and seems to be what the hardware expects. Of
221  * course, when accessing our own descriptor arrays we must make sure
222  * to first perform a modulo of this value or risk running off into
223  * space.
224  *
225  * Watchdog and Device Reset
226  * -------------------------
227  *
228  * While the device is running, the driver periodically invokes a
229  * watchdog function to check that all is well, and to reset the
230  * device if not. The device will be reset if any of the following is
231  * true:
232  *
233  *    o The device's status register fatal error bit is set. A device
234  *      in this state will no longer process any queues;
235  *    o No asynchronous event keepalives have been received for some
236  *      time -- see ENA_DEVICE_KEEPALIVE_TIMEOUT_NS;
237  *    o A Tx queue has remained blocked for some time -- see
238  *      ENA_TX_STALL_TIMEOUT;
239  *    o The device has requested, via an asynchronous event, that we
240  *      perform a reset;
241  *    o Driver code has detected an error and set the ENA_STATE_ERROR
242  *      bit in ena_state.
243  *
244  * There is a "fatal error" asynchronous event, but common code does
245  * not use that as a reason to trigger a reset, and so neither do we.
246  *
247  * The global `ena_force_reset` variable can be used as a simple means
248  * to trigger a reset during driver development and testing. If there
249  * are multiple instances, it is likely that only one of them will
250  * reset when this variable is changed to `true`.
251  *
252  * Attach Sequencing
253  * -----------------
254  *
255  * Most drivers implement their attach/detach/cleanup functions as a
256  * sequential stream of function calls used to allocate and initialize
257  * resources in an order determined by the device's programming manual
258  * combined with any requirements imposed by the kernel and its
259  * relevant modules. These functions can become quite long. It is
260  * often hard to see the order in which steps are taken, and even
261  * harder to tell if detach/cleanup undoes them in the correct order,
262  * or even if it undoes them at all! The only sure way to understand
263  * the flow is to take good notes while closely inspecting each line
264  * of code. Even then, it's easy for attach and detach to get out of
265  * sync.
266  *
267  * Some more recent drivers have improved on this situation by using a
268  * bit vector to track the sequence of events in attach/detach. Each
269  * bit is declared as an enum value, in the same order it is
270  * expected attach would run, and thus detach would run in the exact
271  * opposite order. This has three main benefits:
272  *
273  *    1. It makes it easier to determine sequence order at a
274  *       glance.
275  *
276  *    2. It gives a better idea of what state the device is in during
277  *       debugging (the sequence bit vector is kept with the instance
278  *       state).
279  *
280  *    3. The detach function can verify that all sequence bits are
281  *       cleared, indicating that everything done in attach was
282  *       successfully undone.
283  *
284  * These are great improvements. However, the attach/detach functions
285  * can still become unruly, and there is still no guarantee that
286  * detach is done in opposite order of attach (this is not always
287  * strictly required, but is probably the best way to write detach).
288  * There is still a lot of boilerplate and chance for programmer
289  * error.
290  *
291  * The ena driver takes the sequence idea a bit further, creating a
292  * descriptor table of the attach sequence (ena_attach_tbl). This
293  * table is used by attach/detach to generically, declaratively, and
294  * programmatically enforce the precise sequence order and verify that
295  * anything that is done is undone. This provides several benefits:
296  *
297  *    o Correct order is enforced implicitly by the descriptor table.
298  *      It is impossible for the detach sequence to run in any
299  *      order other than the opposite of attach.
300  *
301  *    o It is obvious what the precise attach sequence is. While the
302  *      bit vector enum helps a lot with this it doesn't prevent
303  *      programmer error. With the sequence defined as a declarative
304  *      table it makes it easy for the programmer to see the order and
305  *      know it's followed exactly.
306  *
307  *    o It is impossible to modify the attach sequence without also
308  *      specifying a callback for its dual in the detach sequence.
309  *
310  *    o Common and repetitive code like error checking, logging, and bit
311  *      vector modification is eliminated and centralized, again
312  *      reducing the chance of programmer error.
313  *
314  * The ena attach sequence is defined under ena_attach_seq_t. The
315  * descriptor table is defined under ena_attach_tbl.
316  */
317 
318 /*
319  * These are some basic data layout invariants on which development
320  * assumptions where made.
321  */
322 CTASSERT(sizeof (enahw_tx_data_desc_t) == 16);
323 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
324 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
325 CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));
326 
327 /*
328  * Amazon does not specify the endianess of the ENA device. We assume
329  * it's the same as the bus, and we assume the CPU/bus is always
330  * little endian.
331  */
332 #ifdef _BIG_ENDIAN
333 #error "ENA driver is little-endian only"
334 #endif
335 
336 /*
337  * These values are used to communicate the driver version to the AWS
338  * hypervisor via the ena_set_host_info() function. We don't know what
339  * exactly AWS does with this info, but it's fairly safe to assume
340  * it's used solely for debug/informational purposes. The Linux driver
341  * updates these values frequently as bugs are fixed and features are
342  * added.
343  */
344 #define	ENA_DRV_VER_MAJOR	1
345 #define	ENA_DRV_VER_MINOR	0
346 #define	ENA_DRV_VER_SUBMINOR	0
347 
348 uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;
349 
350 /*
351  * Log an error message. We leave the destination (console or system
352  * log) up to the caller
353  */
354 void
355 ena_err(const ena_t *ena, const char *fmt, ...)
356 {
357 	va_list ap;
358 
359 	va_start(ap, fmt);
360 	if (ena != NULL && ena->ena_dip != NULL) {
361 		vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
362 	} else {
363 		vcmn_err(CE_WARN, fmt, ap);
364 	}
365 	va_end(ap);
366 }
367 
368 void
369 ena_panic(const ena_t *ena, const char *fmt, ...)
370 {
371 	va_list ap;
372 
373 	va_start(ap, fmt);
374 	if (ena != NULL && ena->ena_dip != NULL) {
375 		vdev_err(ena->ena_dip, CE_PANIC, fmt, ap);
376 	} else {
377 		vcmn_err(CE_PANIC, fmt, ap);
378 	}
379 	va_end(ap);
380 }
381 
/*
 * Debug logging switch: ena_dbg() only emits messages while this is
 * true. Debug messages are disabled by default.
 */
bool ena_debug = false;
386 
387 /*
388  * Log a debug message. We force all debug messages to go to the
389  * system log.
390  */
391 void
392 ena_dbg(const ena_t *ena, const char *fmt, ...)
393 {
394 	va_list ap;
395 
396 	if (ena_debug) {
397 		char msg[1024];
398 
399 		va_start(ap, fmt);
400 		(void) vsnprintf(msg, sizeof (msg), fmt, ap);
401 		va_end(ap);
402 
403 		if (ena != NULL && ena->ena_dip != NULL) {
404 			dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
405 		} else {
406 			cmn_err(CE_NOTE, "!%s", msg);
407 		}
408 	}
409 }
410 
411 void
412 ena_trigger_reset(ena_t *ena, enahw_reset_reason_t reason)
413 {
414 	mutex_enter(&ena->ena_lock);
415 	ena->ena_reset_reason = reason;
416 	mutex_exit(&ena->ena_lock);
417 	atomic_or_32(&ena->ena_state, ENA_STATE_ERROR);
418 }
419 
420 /*
421  * Determine if a given feature is available on this device.
422  */
423 bool
424 ena_is_feat_avail(ena_t *ena, const enahw_feature_id_t feat_id)
425 {
426 	VERIFY3U(feat_id, <=, ENAHW_FEAT_NUM);
427 	uint32_t mask = 1U << feat_id;
428 
429 	/*
430 	 * The device attributes feature is always supported, as
431 	 * indicated by the common code.
432 	 */
433 	if (feat_id == ENAHW_FEAT_DEVICE_ATTRIBUTES)
434 		return (true);
435 
436 	return ((ena->ena_supported_features & mask) != 0);
437 }
438 
439 /*
440  * Determine if a given capability is available on this device.
441  */
442 bool
443 ena_is_cap_avail(ena_t *ena, const enahw_capability_id_t cap_id)
444 {
445 	VERIFY3U(cap_id, <=, ENAHW_CAP_NUM);
446 	uint32_t mask = 1U << cap_id;
447 
448 	return ((ena->ena_capabilities & mask) != 0);
449 }
450 
451 static bool
452 ena_device_reset(ena_t *ena, enum enahw_reset_reason_types reason)
453 {
454 	uint32_t rval, wval, reason_lsb, reason_msb;
455 	hrtime_t timeout, expired;
456 
457 	rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
458 	if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
459 		ena_err(ena, "reset: device is not ready");
460 		return (false);
461 	}
462 
463 	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
464 
465 	/*
466 	 * The device stores the reset timeout at 100ms resolution; we
467 	 * normalize that to nanoseconds.
468 	 */
469 	timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);
470 
471 	if (timeout == 0) {
472 		ena_err(ena, "device gave invalid (0) reset timeout");
473 		return (false);
474 	}
475 
476 	expired = gethrtime() + timeout;
477 
478 	wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
479 
480 	reason_lsb = ENAHW_RESET_REASON_LSB(reason);
481 	reason_msb = ENAHW_RESET_REASON_MSB(reason);
482 
483 	wval |= (reason_lsb << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
484 	    ENAHW_DEV_CTL_RESET_REASON_MASK;
485 	if (ena_is_cap_avail(ena, ENAHW_CAP_EXTENDED_RESET_REASONS)) {
486 		wval |= (reason_msb << ENAHW_DEV_CTL_RESET_REASON_EXT_SHIFT) &
487 		    ENAHW_DEV_CTL_RESET_REASON_EXT_MASK;
488 	} else if (reason_msb != 0) {
489 		/* Fall back to "generic" which we know will fit */
490 		wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
491 		wval |= (ENAHW_RESET_GENERIC <<
492 		    ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
493 		    ENAHW_DEV_CTL_RESET_REASON_MASK;
494 	}
495 
496 	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);
497 
498 	/*
499 	 * Make sure reset is in progress.
500 	 */
501 	for (;;) {
502 		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
503 
504 		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0)
505 			break;
506 
507 		if (gethrtime() > expired) {
508 			ena_err(ena, "device reset start timed out");
509 			return (false);
510 		}
511 
512 		/* Sleep for 100 milliseconds. */
513 		delay(drv_usectohz(100 * 1000));
514 	}
515 
516 	/*
517 	 * Reset the timeout counter for the next device request.
518 	 */
519 	expired = gethrtime() + timeout;
520 
521 	/*
522 	 * Wait for the device reset to finish.
523 	 */
524 	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
525 	for (;;) {
526 		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
527 
528 		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
529 			break;
530 		}
531 
532 		if (gethrtime() > expired) {
533 			ena_err(ena, "device reset timed out");
534 			return (false);
535 		}
536 
537 		/* Sleep for 100 milliseconds. */
538 		delay(drv_usectohz(100 * 1000));
539 	}
540 
541 	ena_dbg(ena, "device reset succeeded");
542 
543 	return (true);
544 }
545 
546 static bool
547 ena_attach_pci(ena_t *ena)
548 {
549 	ddi_acc_handle_t hdl;
550 
551 	if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
552 		return (false);
553 	}
554 
555 	ena->ena_pci_hdl = hdl;
556 	ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
557 	ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
558 	ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
559 	ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
560 	ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
561 	ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
562 	    ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
563 	    ena->ena_pci_svid, ena->ena_pci_sdid);
564 
565 	return (true);
566 }
567 
568 static void
569 ena_cleanup_pci(ena_t *ena, bool resetting)
570 {
571 	VERIFY0(resetting);
572 	pci_config_teardown(&ena->ena_pci_hdl);
573 }
574 
575 static void
576 ena_cleanup_regs_map(ena_t *ena, bool resetting)
577 {
578 	VERIFY0(resetting);
579 	ddi_regs_map_free(&ena->ena_reg_hdl);
580 }
581 
582 static bool
583 ena_attach_regs_map(ena_t *ena)
584 {
585 	int ret = 0;
586 
587 	if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
588 	    DDI_SUCCESS) {
589 		ena_err(ena, "failed to get register set %d size",
590 		    ENA_REG_NUMBER);
591 		return (false);
592 	}
593 
594 	ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
595 	bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
596 	ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
597 	ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
598 	ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
599 
600 	/*
601 	 * This function can return several different failure values,
602 	 * so we make sure to capture its return value for the purpose
603 	 * of logging.
604 	 */
605 	ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
606 	    &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
607 	    &ena->ena_reg_hdl);
608 
609 	if (ret != DDI_SUCCESS) {
610 		ena_err(ena, "failed to map register set %d: %d",
611 		    ENA_REG_NUMBER, ret);
612 		return (false);
613 	}
614 
615 	ena_dbg(ena, "registers mapped to base: 0x%p",
616 	    (void *)ena->ena_reg_base);
617 
618 	return (true);
619 }
620 
621 /*
622  * Free any resources related to the admin submission queue.
623  */
624 static void
625 ena_admin_sq_free(ena_t *ena)
626 {
627 	ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
628 }
629 
630 /*
631  * Initialize the admin submission queue.
632  */
633 static bool
634 ena_admin_sq_init(ena_t *ena)
635 {
636 	ena_adminq_t *aq = &ena->ena_aq;
637 	ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
638 	size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
639 	uint32_t addr_low, addr_high, wval;
640 
641 	if (aq->ea_sq.eas_entries == NULL) {
642 		ena_dma_conf_t conf = {
643 			.edc_size = size,
644 			.edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
645 			.edc_sgl = 1,
646 			.edc_endian = DDI_NEVERSWAP_ACC,
647 			.edc_stream = false,
648 		};
649 
650 		if (!ena_dma_alloc(ena, dma, &conf, size)) {
651 			ena_err(ena, "failed to allocate DMA for Admin SQ");
652 			return (false);
653 		}
654 
655 		ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
656 		aq->ea_sq.eas_entries = (void *)dma->edb_va;
657 	} else {
658 		ena_dma_bzero(dma);
659 	}
660 
661 	aq->ea_sq.eas_tail = 0;
662 	aq->ea_sq.eas_phase = 1;
663 	aq->ea_sq.eas_dbaddr =
664 	    (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
665 	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
666 	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
667 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
668 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
669 	wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
670 	    ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
671 	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);
672 
673 	return (true);
674 }
675 
676 /*
677  * Free any resources related to the admin completion queue.
678  */
679 static void
680 ena_admin_cq_free(ena_t *ena)
681 {
682 	ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
683 }
684 
685 /*
686  * Initialize the admin completion queue.
687  */
688 static bool
689 ena_admin_cq_init(ena_t *ena)
690 {
691 	ena_adminq_t *aq = &ena->ena_aq;
692 	ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
693 	uint32_t addr_low, addr_high, wval;
694 
695 	if (aq->ea_cq.eac_entries == NULL) {
696 		size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
697 		ena_dma_conf_t conf = {
698 			.edc_size = size,
699 			.edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
700 			.edc_sgl = 1,
701 			.edc_endian = DDI_NEVERSWAP_ACC,
702 			.edc_stream = false,
703 		};
704 
705 		if (!ena_dma_alloc(ena, dma, &conf, size)) {
706 			ena_err(ena, "failed to allocate DMA for Admin CQ");
707 			return (false);
708 		}
709 
710 		ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
711 		aq->ea_cq.eac_entries = (void *)dma->edb_va;
712 	} else {
713 		ena_dma_bzero(dma);
714 	}
715 
716 	aq->ea_cq.eac_head = 0;
717 	aq->ea_cq.eac_phase = 1;
718 	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
719 	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
720 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
721 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
722 	wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
723 	    ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
724 	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);
725 
726 	return (true);
727 }
728 
729 void
730 ena_update_hints(ena_t *ena, enahw_device_hints_t *hints)
731 {
732 	ena->ena_device_hints.eh_mmio_read_timeout =
733 	    hints->edh_mmio_read_timeout;
734 	ena->ena_device_hints.eh_keep_alive_timeout =
735 	    hints->edh_keep_alive_timeout;
736 	ena->ena_device_hints.eh_tx_comp_timeout = hints->edh_tx_comp_timeout;
737 	ena->ena_device_hints.eh_missed_tx_reset_threshold =
738 	    hints->edh_missed_tx_reset_threshold;
739 	ena->ena_device_hints.eh_admin_comp_timeout =
740 	    hints->edh_admin_comp_timeout;
741 	ena->ena_device_hints.eh_max_tx_sgl = hints->edh_max_tx_sgl;
742 	ena->ena_device_hints.eh_max_rx_sgl = hints->edh_max_rx_sgl;
743 }
744 
745 /*
746  * We limit the max number of I/O queues based on several aspects of
747  * the underlying hardware.
748  *
749  * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
750  *    which comes from the common code and presumably is based on device
751  *    constraints.
752  *
753  * 2. Next we latch the number of I/O queues to the number of online
754  *    CPUs. The idea being that each queue is a parallel work stream,
755  *    and having more queues than CPUs to flush them will not improve
756  *    performance. The number of online CPUs can change dynamically,
757  *    and that's okay, everything should still work fine, it just
758  *    might not be ideal.
759  *
760  * 3. Next we latch the number of I/O queues to the smallest of the
761  *    max Tx queues and max Rx queues. We could probably loosen this
762  *    restriction in the future, and have separate max I/O queues for
763  *    Tx and Rx. This is what Linux does, and seems like a fine place
764  *    to start.
765  */
766 static void
767 ena_set_max_io_queues(ena_t *ena)
768 {
769 	uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;
770 
771 	max = MIN(ncpus_online, max);
772 	/*
773 	 * Supposedly a device could present a different number of SQs
774 	 * and CQs. This driver is designed in a way that requires
775 	 * each SQ to have a corresponding and dedicated CQ (how would
776 	 * it work otherwise). Therefore, we must check both values
777 	 * and find the minimum between them.
778 	 */
779 	max = MIN(ena->ena_tx_max_sq_num, max);
780 	max = MIN(ena->ena_tx_max_cq_num, max);
781 	max = MIN(ena->ena_rx_max_sq_num, max);
782 	max = MIN(ena->ena_rx_max_cq_num, max);
783 
784 
785 	/* This shouldn't happen, but just in case. */
786 	if (max == 0) {
787 		max = 1;
788 	}
789 
790 	ena->ena_max_io_queues = max;
791 }
792 
793 /*
794  * We require that an Rx or Tx buffer be able to hold the maximum MTU
795  * along with the maximum frame header length. In this case we know
796  * ENA is presenting us an Ethernet frame so we add the size of an
797  * Ethernet VLAN header. Rx has the additional requirement of needing
798  * additional margin for the sake of IP header alignment.
799  */
800 static void
801 ena_update_buf_sizes(ena_t *ena)
802 {
803 	ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
804 	ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
805 	ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
806 	    ena->ena_page_sz, uint32_t);
807 	ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
808 	    ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
809 }
810 
811 static bool
812 ena_get_hints(ena_t *ena)
813 {
814 	int ret;
815 	enahw_resp_desc_t resp;
816 	enahw_device_hints_t *hints = &resp.erd_resp.erd_get_feat.ergf_hints;
817 
818 	ena_dbg(ena, "Requesting hints");
819 
820 	bzero(&resp, sizeof (resp));
821 	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_HW_HINTS,
822 	    ENAHW_FEAT_HW_HINTS_VER);
823 
824 	if (ret == ENOTSUP) {
825 		/* In this case the device does not support querying hints */
826 		ena_dbg(ena, "Hints are unsupported");
827 		return (true);
828 	} else if (ret != 0) {
829 		ena_err(ena, "Error getting hints: %d", ret);
830 		return (false);
831 	}
832 
833 	ena_update_hints(ena, hints);
834 
835 	return (true);
836 }
837 
838 static bool
839 ena_get_offloads(ena_t *ena)
840 {
841 	int ret = 0;
842 	enahw_resp_desc_t resp;
843 	enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;
844 
845 	ena->ena_tx_l3_ipv4_csum = false;
846 
847 	ena->ena_tx_l4_ipv4_part_csum = false;
848 	ena->ena_tx_l4_ipv4_full_csum = false;
849 	ena->ena_tx_l4_ipv4_lso = false;
850 
851 	ena->ena_tx_l4_ipv6_part_csum = false;
852 	ena->ena_tx_l4_ipv6_full_csum = false;
853 	ena->ena_tx_l4_ipv6_lso = false;
854 
855 	ena->ena_rx_l3_ipv4_csum = false;
856 	ena->ena_rx_l4_ipv4_csum = false;
857 	ena->ena_rx_l4_ipv6_csum = false;
858 	ena->ena_rx_hash = false;
859 
860 	bzero(&resp, sizeof (resp));
861 	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
862 	    ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);
863 
864 	if (ret == ENOTSUP) {
865 		/*
866 		 * In this case the device does not support querying
867 		 * for hardware offloads. We take that as a sign that
868 		 * the device provides no offloads.
869 		 */
870 		return (true);
871 	} else if (ret != 0) {
872 		ena_err(ena, "error getting stateless offload: %d", ret);
873 		return (false);
874 	}
875 
876 	ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);
877 
878 	ena->ena_tx_l4_ipv4_part_csum =
879 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
880 	ena->ena_tx_l4_ipv4_full_csum =
881 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
882 	ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);
883 
884 	ena->ena_tx_l4_ipv6_part_csum =
885 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
886 	ena->ena_tx_l4_ipv6_full_csum =
887 	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
888 	ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);
889 
890 	ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
891 	ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
892 	ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
893 	return (true);
894 }
895 
896 static int
897 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
898     const int defval)
899 {
900 	int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
901 	    DDI_PROP_DONTPASS, propname, defval);
902 
903 	if (value > maxval) {
904 		ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
905 		    propname, value, maxval);
906 		value = maxval;
907 	}
908 
909 	if (value < minval) {
910 		ena_err(ena, "user value %s=%d below minimum, setting to %d",
911 		    propname, value, minval);
912 		value = minval;
913 	}
914 
915 	return (value);
916 }
917 
918 static bool
919 ena_set_mtu(ena_t *ena)
920 {
921 	int ret = 0;
922 	enahw_cmd_desc_t cmd;
923 	enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
924 	enahw_resp_desc_t resp;
925 
926 	bzero(&cmd, sizeof (cmd));
927 	bzero(&resp, sizeof (resp));
928 	feat->efm_mtu = ena->ena_mtu;
929 
930 	if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
931 	    ENAHW_FEAT_MTU_VER)) != 0) {
932 		ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
933 		    ret);
934 		return (false);
935 	}
936 
937 	return (true);
938 }
939 
940 static void
941 ena_get_link_config(ena_t *ena)
942 {
943 	enahw_resp_desc_t resp;
944 	enahw_feat_link_conf_t *feat =
945 	    &resp.erd_resp.erd_get_feat.ergf_link_conf;
946 	bool full_duplex;
947 
948 	bzero(&resp, sizeof (resp));
949 
950 	if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
951 	    ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
952 		/*
953 		 * Some ENA devices do no support this feature. In
954 		 * those cases we report a 1Gbps link, full duplex.
955 		 * For the most accurate information on bandwidth
956 		 * limits see the official AWS documentation.
957 		 */
958 		ena->ena_link_speed_mbits = 1000;
959 		ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
960 		ena->ena_link_duplex = LINK_DUPLEX_FULL;
961 		ena->ena_link_autoneg = true;
962 		return;
963 	}
964 
965 	ena->ena_link_speed_mbits = feat->eflc_speed;
966 	ena->ena_link_speeds = feat->eflc_supported;
967 	full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
968 	ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
969 	    LINK_DUPLEX_HALF;
970 	ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
971 }
972 
973 /*
974  * Retrieve all configuration values which are modifiable via
975  * ena.conf, and set ena_t members accordingly. While the conf values
976  * have priority, they may be implicitly modified by the driver to
977  * meet resource constraints on a given platform. If no value is
978  * specified in the conf file, the driver will attempt to use the
979  * largest value supported. While there should be no value large
980  * enough, keep in mind that ena_get_prop() will cast the values to an
981  * int.
982  *
983  * This function should be called after the device is initialized,
984  * admin queue is established, and the hardware features/capabs have
985  * been queried; it should be called before mac registration.
986  */
987 static bool
988 ena_attach_read_conf(ena_t *ena)
989 {
990 	uint32_t gcv;	/* Greatest Common Value */
991 
992 	/*
993 	 * We expect that the queue lengths are the same for both the
994 	 * CQ and SQ, but technically the device could return
995 	 * different lengths. For now the driver locks them together.
996 	 */
997 	gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
998 	ASSERT3U(gcv, <=, INT_MAX);
999 	ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
1000 	    ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);
1001 
1002 	ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
1003 	    ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
1004 	    ENA_PROP_RXQ_INTR_LIMIT_DEF);
1005 
1006 	gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
1007 	ASSERT3U(gcv, <=, INT_MAX);
1008 	ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
1009 	    ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);
1010 
1011 	return (true);
1012 }
1013 
1014 /*
1015  * Perform any necessary device configuration after the driver.conf
1016  * has been read.
1017  */
1018 static bool
1019 ena_attach_dev_cfg(ena_t *ena)
1020 {
1021 	ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);
1022 
1023 	if (!ena_set_mtu(ena)) {
1024 		/*
1025 		 * We don't expect this to fail, but we try a fallback
1026 		 * first before failing the attach sequence.
1027 		 */
1028 		ena->ena_mtu = 1500;
1029 		ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);
1030 
1031 		if (!ena_set_mtu(ena)) {
1032 			return (false);
1033 		}
1034 	}
1035 
1036 	return (true);
1037 }
1038 
1039 static bool
1040 ena_check_versions(ena_t *ena)
1041 {
1042 	uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
1043 	uint32_t ctrl_vsn =
1044 	    ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);
1045 
1046 	ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
1047 	ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);
1048 
1049 	ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
1050 	ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
1051 	ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
1052 	ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);
1053 
1054 	ena_dbg(ena, "device version: %u.%u",
1055 	    ena->ena_dev_major_vsn, ena->ena_dev_minor_vsn);
1056 	ena_dbg(ena, "controller version: %u.%u.%u implementation %u",
1057 	    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1058 	    ena->ena_ctrl_subminor_vsn, ena->ena_ctrl_impl_id);
1059 
1060 	if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
1061 		ena_err(ena, "unsupported controller version: %u.%u.%u",
1062 		    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1063 		    ena->ena_ctrl_subminor_vsn);
1064 		return (false);
1065 	}
1066 
1067 	return (true);
1068 }
1069 
1070 static bool
1071 ena_adminq_init(ena_t *ena)
1072 {
1073 	ena_adminq_t *aq = &ena->ena_aq;
1074 
1075 	/*
1076 	 * As we are not using an interrupt for admin queue completion
1077 	 * signaling, we do not need a priority on these mutexes. If
1078 	 * that changes, we will have to rejigger some code to create
1079 	 * the admin queue interrupt before this function.
1080 	 */
1081 	mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
1082 	mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
1083 	mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
1084 	aq->ea_qlen = ENA_ADMINQ_DEPTH;
1085 	aq->ea_pending_cmds = 0;
1086 
1087 	aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
1088 	    KM_SLEEP);
1089 	list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
1090 	    offsetof(ena_cmd_ctx_t, ectx_node));
1091 	list_create(&aq->ea_cmd_ctxs_used, sizeof (ena_cmd_ctx_t),
1092 	    offsetof(ena_cmd_ctx_t, ectx_node));
1093 
1094 	ena_create_cmd_ctx(ena);
1095 
1096 	/*
1097 	 * Start in polling mode until we've determined the number of queues
1098 	 * and are ready to configure and enable interrupts.
1099 	 */
1100 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
1101 	aq->ea_poll_mode = true;
1102 
1103 	return (true);
1104 }
1105 
1106 /*
1107  * Free all resources allocated as part of ena_device_init().
1108  */
1109 static void
1110 ena_cleanup_device_init(ena_t *ena, bool resetting)
1111 {
1112 	ena_adminq_t *aq = &ena->ena_aq;
1113 
1114 	VERIFY0(resetting);
1115 
1116 	ena_free_host_info(ena);
1117 	mutex_destroy(&aq->ea_sq_lock);
1118 	mutex_destroy(&aq->ea_cq_lock);
1119 	mutex_destroy(&aq->ea_stat_lock);
1120 	list_destroy(&aq->ea_cmd_ctxs_free);
1121 	list_destroy(&aq->ea_cmd_ctxs_used);
1122 	kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
1123 	ena_admin_sq_free(ena);
1124 	ena_admin_cq_free(ena);
1125 	ena_aenq_free(ena);
1126 	ena_stat_device_cleanup(ena);
1127 	ena_stat_device_basic_cleanup(ena);
1128 	ena_stat_device_extended_cleanup(ena);
1129 	ena_stat_aenq_cleanup(ena);
1130 }
1131 
1132 static bool
1133 ena_attach_device_init(ena_t *ena)
1134 {
1135 	ena_adminq_t *aq = &ena->ena_aq;
1136 	uint32_t rval;
1137 	uint8_t dma_width;
1138 	hrtime_t cmd_timeout;
1139 	enahw_resp_desc_t resp;
1140 	enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
1141 	uint8_t *maddr;
1142 	uint32_t supported_features;
1143 	int ret = 0;
1144 
1145 	ena->ena_reset_reason = ENAHW_RESET_NORMAL;
1146 	if (!ena_device_reset(ena, ena->ena_reset_reason))
1147 		return (false);
1148 
1149 	if (!ena_check_versions(ena))
1150 		return (false);
1151 
1152 	ena_init_regcache(ena);
1153 
1154 	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1155 	dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
1156 	ena->ena_dma_width = dma_width;
1157 
1158 	/*
1159 	 * The value stored in the device register is in the
1160 	 * resolution of 100 milliseconds. We normalize that to
1161 	 * nanoseconds.
1162 	 */
1163 	cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
1164 	aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);
1165 
1166 	if (aq->ea_cmd_timeout_ns == 0)
1167 		aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;
1168 
1169 	if (!ena_adminq_init(ena))
1170 		return (false);
1171 
1172 	if (!ena_admin_sq_init(ena))
1173 		return (false);
1174 
1175 	if (!ena_admin_cq_init(ena))
1176 		return (false);
1177 
1178 	if (!ena_aenq_init(ena))
1179 		return (false);
1180 
1181 	bzero(&resp, sizeof (resp));
1182 	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
1183 	    ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);
1184 
1185 	if (ret != 0) {
1186 		ena_err(ena, "failed to get device attributes: %d", ret);
1187 		return (false);
1188 	}
1189 
1190 	ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
1191 	ena_dbg(ena, "device version: %u", feat->efda_device_version);
1192 	ena_dbg(ena, "supported features: 0x%x",
1193 	    feat->efda_supported_features);
1194 	ena_dbg(ena, "device capabilities: 0x%x", feat->efda_capabilities);
1195 	ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
1196 	ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
1197 	maddr = feat->efda_mac_addr;
1198 	ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
1199 	    maddr[2], maddr[3], maddr[4], maddr[5]);
1200 	ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);
1201 
1202 	bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
1203 	ena->ena_max_mtu = feat->efda_max_mtu;
1204 	ena->ena_capabilities = feat->efda_capabilities;
1205 	supported_features = feat->efda_supported_features;
1206 	ena->ena_supported_features = supported_features;
1207 	feat = NULL;
1208 	bzero(&resp, sizeof (resp));
1209 
1210 	if (ena_is_feat_avail(ena, ENAHW_FEAT_MAX_QUEUES_EXT)) {
1211 		enahw_feat_max_queue_ext_t *feat_mqe =
1212 		    &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;
1213 
1214 		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
1215 		    ENAHW_FEAT_MAX_QUEUES_EXT_VER);
1216 
1217 		if (ret != 0) {
1218 			ena_err(ena, "failed to query max queues ext: %d", ret);
1219 			return (false);
1220 		}
1221 
1222 		ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
1223 		ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
1224 		ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
1225 		ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
1226 		ena->ena_tx_max_desc_per_pkt =
1227 		    feat_mqe->efmqe_max_per_packet_tx_descs;
1228 		ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;
1229 
1230 		ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
1231 		ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
1232 		ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
1233 		ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
1234 		ena->ena_rx_max_desc_per_pkt =
1235 		    feat_mqe->efmqe_max_per_packet_rx_descs;
1236 
1237 		ena_set_max_io_queues(ena);
1238 	} else {
1239 		enahw_feat_max_queue_t *feat_mq =
1240 		    &resp.erd_resp.erd_get_feat.ergf_max_queue;
1241 
1242 		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
1243 		    ENAHW_FEAT_MAX_QUEUES_NUM_VER);
1244 
1245 		if (ret != 0) {
1246 			ena_err(ena, "failed to query max queues: %d", ret);
1247 			return (false);
1248 		}
1249 
1250 		ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
1251 		ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1252 		ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
1253 		ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1254 		ena->ena_tx_max_desc_per_pkt =
1255 		    feat_mq->efmq_max_per_packet_tx_descs;
1256 		ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;
1257 
1258 		ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
1259 		ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1260 		ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
1261 		ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1262 		ena->ena_rx_max_desc_per_pkt =
1263 		    feat_mq->efmq_max_per_packet_rx_descs;
1264 
1265 		ena_set_max_io_queues(ena);
1266 	}
1267 
1268 	ena->ena_mtu = ena->ena_max_mtu;
1269 	ena_update_buf_sizes(ena);
1270 
1271 	if (!ena_get_hints(ena))
1272 		return (false);
1273 
1274 	ena->ena_tx_sgl_max_sz = 1;
1275 	ena->ena_rx_sgl_max_sz = 1;
1276 	if (ena->ena_device_hints.eh_max_tx_sgl != 0)
1277 		ena->ena_tx_sgl_max_sz = ena->ena_device_hints.eh_max_tx_sgl;
1278 	if (ena->ena_device_hints.eh_max_rx_sgl != 0)
1279 		ena->ena_rx_sgl_max_sz = ena->ena_device_hints.eh_max_rx_sgl;
1280 
1281 	if (!ena_init_host_info(ena))
1282 		return (false);
1283 
1284 	if (!ena_aenq_configure(ena))
1285 		return (false);
1286 
1287 	ena_get_link_config(ena);
1288 
1289 	if (!ena_get_offloads(ena))
1290 		return (false);
1291 
1292 	if (!ena_stat_device_init(ena))
1293 		return (false);
1294 
1295 	if (!ena_stat_device_basic_init(ena))
1296 		return (false);
1297 
1298 	if (!ena_stat_device_extended_init(ena))
1299 		return (false);
1300 
1301 	if (!ena_stat_aenq_init(ena))
1302 		return (false);
1303 
1304 	ena_update_regcache(ena);
1305 
1306 	return (true);
1307 }
1308 
1309 static void
1310 ena_cleanup_intr_alloc(ena_t *ena, bool resetting)
1311 {
1312 	VERIFY0(resetting);
1313 
1314 	for (int i = 0; i < ena->ena_num_intrs; i++) {
1315 		int ret = ddi_intr_free(ena->ena_intr_handles[i]);
1316 		if (ret != DDI_SUCCESS) {
1317 			ena_err(ena, "failed to free interrupt %d: %d", i, ret);
1318 		}
1319 	}
1320 
1321 	if (ena->ena_intr_handles != NULL) {
1322 		kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
1323 		ena->ena_intr_handles = NULL;
1324 		ena->ena_intr_handles_sz = 0;
1325 	}
1326 }
1327 
1328 /*
1329  * The Linux driver supports only MSI-X interrupts. We do the same,
1330  * with the assumption that it's the only type of interrupt the device
1331  * can present.
1332  */
1333 static bool
1334 ena_attach_intr_alloc(ena_t *ena)
1335 {
1336 	int ret;
1337 	int types;
1338 	int min, req, ideal, avail, actual;
1339 
1340 	ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
1341 	if (ret != DDI_SUCCESS) {
1342 		ena_err(ena, "failed to get interrupt types: %d", ret);
1343 		return (false);
1344 	}
1345 
1346 	ena_dbg(ena, "supported interrupt types: 0x%x", types);
1347 	if ((types & DDI_INTR_TYPE_MSIX) == 0) {
1348 		ena_err(ena, "the ena driver only supports MSI-X interrupts");
1349 		return (false);
1350 	}
1351 
1352 	/* One for I/O, one for adminq. */
1353 	min = 2;
1354 	ideal = ena->ena_max_io_queues + 1;
1355 	ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1356 	if (ret != DDI_SUCCESS) {
1357 		ena_err(ena, "failed to get number of MSI-X interrupts: %d",
1358 		    ret);
1359 		return (false);
1360 	}
1361 
1362 	if (avail < min) {
1363 		ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
1364 		    "requires a minimum of %d", avail, min);
1365 		return (false);
1366 	}
1367 
1368 	ena_dbg(ena, "%d MSI-X interrupts available", avail);
1369 
1370 	ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1371 	if (ret != DDI_SUCCESS) {
1372 		ena_err(ena, "failed to get available interrupts: %d", ret);
1373 		return (false);
1374 	}
1375 
1376 	if (avail < min) {
1377 		ena_err(ena, "number of available MSI-X interrupts is %d, "
1378 		    "but the driver requires a minimum of %d", avail, min);
1379 		return (false);
1380 	}
1381 
1382 	req = MIN(ideal, avail);
1383 	ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
1384 	ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);
1385 
1386 	ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
1387 	    DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
1388 	if (ret != DDI_SUCCESS) {
1389 		ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
1390 		    req, ret);
1391 		return (false);
1392 	}
1393 
1394 	if (actual < min) {
1395 		ena_err(ena, "number of allocated interrupts is %d, but the "
1396 		    "driver requires a minimum of %d", actual, min);
1397 		return (false);
1398 	}
1399 
1400 	ena->ena_num_intrs = actual;
1401 
1402 	ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
1403 	if (ret != DDI_SUCCESS) {
1404 		ena_err(ena, "failed to get interrupt capability: %d", ret);
1405 		return (false);
1406 	}
1407 
1408 	ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
1409 	if (ret != DDI_SUCCESS) {
1410 		ena_err(ena, "failed to get interrupt priority: %d", ret);
1411 		return (false);
1412 	}
1413 
1414 	ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
1415 	    actual, ena->ena_intr_caps, ena->ena_intr_pri);
1416 
1417 	/*
1418 	 * The ena_lock should not be held in the data path, but it is
1419 	 * held as part of the AENQ handler, which runs in interrupt
1420 	 * context. Therefore, we delayed the initialization of this
1421 	 * mutex until after the interrupts are allocated.
1422 	 */
1423 	mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
1424 	    DDI_INTR_PRI(ena->ena_intr_pri));
1425 	mutex_init(&ena->ena_watchdog_lock, NULL, MUTEX_DRIVER, NULL);
1426 
1427 	return (true);
1428 }
1429 
1430 /*
1431  * Allocate the parent Rx queue structures. More importantly, this is
1432  * NOT allocating the queue descriptors or data buffers. Those are
1433  * allocated on demand as queues are started.
1434  */
1435 static bool
1436 ena_attach_alloc_rxqs(ena_t *ena)
1437 {
1438 	bool resetting = false;
1439 
1440 	if (ena->ena_rxqs == NULL) {
1441 		/*
1442 		 * We rely on the interrupt priority for initializing the
1443 		 * mutexes.
1444 		 */
1445 		VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1446 		ena->ena_num_rxqs = ena->ena_num_intrs - 1;
1447 		ASSERT3U(ena->ena_num_rxqs, >, 0);
1448 		ena->ena_rxqs = kmem_zalloc(
1449 		    ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), KM_SLEEP);
1450 	} else {
1451 		resetting = true;
1452 	}
1453 
1454 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1455 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1456 
1457 		rxq->er_rxqs_idx = i;
1458 		/* The 0th vector is for Admin + AENQ. */
1459 		rxq->er_intr_vector = i + 1;
1460 		rxq->er_mrh = NULL;
1461 
1462 		if (!resetting) {
1463 			mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
1464 			    DDI_INTR_PRI(ena->ena_intr_pri));
1465 			mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
1466 			    DDI_INTR_PRI(ena->ena_intr_pri));
1467 		}
1468 
1469 		rxq->er_ena = ena;
1470 		rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
1471 		rxq->er_cq_num_descs = ena->ena_rxq_num_descs;
1472 
1473 		if (!ena_stat_rxq_init(rxq)) {
1474 			return (false);
1475 		}
1476 
1477 		if (!ena_alloc_rxq(rxq)) {
1478 			ena_stat_rxq_cleanup(rxq);
1479 			return (false);
1480 		}
1481 	}
1482 
1483 	return (true);
1484 }
1485 
1486 static void
1487 ena_cleanup_rxqs(ena_t *ena, bool resetting)
1488 {
1489 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1490 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1491 
1492 		ena_cleanup_rxq(rxq, resetting);
1493 		if (!resetting) {
1494 			mutex_destroy(&rxq->er_lock);
1495 			mutex_destroy(&rxq->er_stat_lock);
1496 		}
1497 		ena_stat_rxq_cleanup(rxq);
1498 	}
1499 
1500 	if (!resetting) {
1501 		kmem_free(ena->ena_rxqs,
1502 		    ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
1503 		ena->ena_rxqs = NULL;
1504 	}
1505 }
1506 
1507 /*
1508  * Allocate the parent Tx queue structures. More importantly, this is
1509  * NOT allocating the queue descriptors or data buffers. Those are
1510  * allocated on demand as a queue is started.
1511  */
1512 static bool
1513 ena_attach_alloc_txqs(ena_t *ena)
1514 {
1515 	bool resetting = false;
1516 
1517 	if (ena->ena_txqs == NULL) {
1518 		/*
1519 		 * We rely on the interrupt priority for initializing the
1520 		 * mutexes.
1521 		 */
1522 		VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1523 		ena->ena_num_txqs = ena->ena_num_intrs - 1;
1524 		ASSERT3U(ena->ena_num_txqs, >, 0);
1525 		ena->ena_txqs = kmem_zalloc(
1526 		    ena->ena_num_txqs * sizeof (*ena->ena_txqs), KM_SLEEP);
1527 	} else {
1528 		resetting = true;
1529 	}
1530 
1531 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1532 		ena_txq_t *txq = &ena->ena_txqs[i];
1533 
1534 		txq->et_txqs_idx = i;
1535 		/* The 0th vector is for Admin + AENQ. */
1536 		txq->et_intr_vector = i + 1;
1537 		txq->et_mrh = NULL;
1538 
1539 		if (!resetting) {
1540 			mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
1541 			    DDI_INTR_PRI(ena->ena_intr_pri));
1542 			mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
1543 			    DDI_INTR_PRI(ena->ena_intr_pri));
1544 		}
1545 
1546 		txq->et_ena = ena;
1547 		txq->et_sq_num_descs = ena->ena_txq_num_descs;
1548 		txq->et_cq_num_descs = ena->ena_txq_num_descs;
1549 
1550 		if (!ena_stat_txq_init(txq)) {
1551 			return (false);
1552 		}
1553 
1554 		if (!ena_alloc_txq(txq)) {
1555 			ena_stat_txq_cleanup(txq);
1556 			return (false);
1557 		}
1558 	}
1559 
1560 	return (true);
1561 }
1562 
1563 static void
1564 ena_cleanup_txqs(ena_t *ena, bool resetting)
1565 {
1566 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1567 		ena_txq_t *txq = &ena->ena_txqs[i];
1568 
1569 		ena_cleanup_txq(txq, resetting);
1570 		if (!resetting) {
1571 			mutex_destroy(&txq->et_lock);
1572 			mutex_destroy(&txq->et_stat_lock);
1573 		}
1574 		ena_stat_txq_cleanup(txq);
1575 	}
1576 
1577 	if (!resetting) {
1578 		kmem_free(ena->ena_txqs,
1579 		    ena->ena_num_txqs * sizeof (*ena->ena_txqs));
1580 		ena->ena_txqs = NULL;
1581 	}
1582 }
1583 
1584 /*
1585  * To reset the device we need to unwind some of the steps taken during attach
1586  * but, since the device could well be in a failed state, we cannot rely on
1587  * being able to talk via the admin queue to do things such as explicitly
1588  * destroy rings. We call selected cleanup handlers with the second parameter
1589  * set to "true" to indicate that we are resetting and should avoid such
1590  * communication.
1591  *
1592  * The existing DMA memory regions for the admin queue, async event queue and
1593  * host information are preserved but have their contents zeroed.
1594  * Experimentation has shown that the device hangs onto old async event queue
1595  * addresses, even through a reset, with surprising results if the addresses
1596  * happen to change.
1597  *
1598  * We clean up all of the Tx and Rx ring descriptors and the TCBs but leave the
1599  * allocated memory for the ring data and mutexes intact. Pointers to this
1600  * memory have already been provided to MAC, and the mutexes keep the rings
1601  * locked until we're ready to start them again.
1602  *
1603  * To ensure that other driver activity is excluded, we hold the mutexes on the
1604  * Tx and Rx rings throughout, and unset the `ENA_STATE_STARTED` bit in the
1605  * state, which causes the interrupt handlers to return without doing any work.
1606  * The admin interrupt, used for notifications of admin completions or new
1607  * asynchronous events, is masked after the device is reset until we're ready
1608  * to process them again.
1609  */
1610 bool
1611 ena_reset(ena_t *ena, const enahw_reset_reason_t reason)
1612 {
1613 	ena_txq_state_t tx_state[ena->ena_num_txqs];
1614 	ena_rxq_state_t rx_state[ena->ena_num_rxqs];
1615 	bool ret = false;
1616 
1617 	ena_err(ena, "resetting device with reason 0x%x [%s]",
1618 	    reason, enahw_reset_reason(reason));
1619 
1620 	VERIFY0(ena->ena_state & ENA_STATE_RESETTING);
1621 	atomic_or_32(&ena->ena_state, ENA_STATE_RESETTING);
1622 
1623 	VERIFY(ena->ena_state & ENA_STATE_STARTED);
1624 	atomic_and_32(&ena->ena_state, ~ENA_STATE_STARTED);
1625 
1626 	mutex_enter(&ena->ena_lock);
1627 
1628 	ena_update_regcache(ena);
1629 
1630 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1631 		ena_txq_t *txq = &ena->ena_txqs[i];
1632 
1633 		mutex_enter(&txq->et_lock);
1634 		tx_state[i] = txq->et_state;
1635 		if (txq->et_state & ENA_TXQ_STATE_RUNNING)
1636 			ena_ring_tx_stop((mac_ring_driver_t)txq);
1637 	}
1638 
1639 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1640 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1641 
1642 		mutex_enter(&rxq->er_lock);
1643 		rx_state[i] = rxq->er_state;
1644 		if (rxq->er_state & ENA_RXQ_STATE_RUNNING)
1645 			ena_ring_rx_stop((mac_ring_driver_t)rxq);
1646 	}
1647 
1648 	if (!ena_device_reset(ena, reason)) {
1649 		ena_err(ena, "reset: failed to reset device");
1650 		goto out;
1651 	}
1652 
1653 	/* This masks the admin/aenq interrupt */
1654 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
1655 
1656 	ena_cleanup_txqs(ena, true);
1657 	ena_cleanup_rxqs(ena, true);
1658 
1659 	ena_release_all_cmd_ctx(ena);
1660 
1661 	if (!ena_admin_cq_init(ena) || !ena_admin_sq_init(ena)) {
1662 		ena_err(ena, "reset: failed to program admin queues");
1663 		goto out;
1664 	}
1665 
1666 	if (!ena_init_host_info(ena)) {
1667 		ena_err(ena, "reset: failed to set host info");
1668 		goto out;
1669 	}
1670 
1671 	if (!ena_aenq_init(ena) || !ena_aenq_configure(ena)) {
1672 		ena_err(ena, "reset: failed to configure aenq");
1673 		goto out;
1674 	}
1675 
1676 	if (!ena_set_mtu(ena)) {
1677 		ena_err(ena, "reset: failed to set MTU");
1678 		goto out;
1679 	}
1680 
1681 	if (!ena_attach_alloc_txqs(ena) || !ena_attach_alloc_rxqs(ena)) {
1682 		ena_err(ena, "reset: failed to program IO queues");
1683 		goto out;
1684 	}
1685 
1686 	ena_aenq_enable(ena);
1687 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1688 
1689 	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1690 		ena_rxq_t *rxq = &ena->ena_rxqs[i];
1691 
1692 		mutex_exit(&rxq->er_lock);
1693 		if (rx_state[i] & ENA_RXQ_STATE_RUNNING) {
1694 			(void) ena_ring_rx_start((mac_ring_driver_t)rxq,
1695 			    rxq->er_m_gen_num);
1696 		}
1697 	}
1698 
1699 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1700 		ena_txq_t *txq = &ena->ena_txqs[i];
1701 
1702 		mutex_exit(&txq->et_lock);
1703 		if (tx_state[i] & ENA_TXQ_STATE_RUNNING) {
1704 			(void) ena_ring_tx_start((mac_ring_driver_t)txq,
1705 			    txq->et_m_gen_num);
1706 		}
1707 	}
1708 
1709 	atomic_or_32(&ena->ena_state, ENA_STATE_STARTED);
1710 	ret = true;
1711 
1712 out:
1713 	atomic_and_32(&ena->ena_state, ~ENA_STATE_RESETTING);
1714 	mutex_exit(&ena->ena_lock);
1715 
1716 	ena_update_regcache(ena);
1717 
1718 	return (ret);
1719 }
1720 
1721 ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = {
1722 	{
1723 		.ead_seq = ENA_ATTACH_PCI,
1724 		.ead_name = "PCI config",
1725 		.ead_attach_fn = ena_attach_pci,
1726 		.ead_attach_hard_fail = true,
1727 		.ead_cleanup_fn = ena_cleanup_pci,
1728 	},
1729 
1730 	{
1731 		.ead_seq = ENA_ATTACH_REGS,
1732 		.ead_name = "BAR mapping",
1733 		.ead_attach_fn = ena_attach_regs_map,
1734 		.ead_attach_hard_fail = true,
1735 		.ead_cleanup_fn = ena_cleanup_regs_map,
1736 	},
1737 
1738 	{
1739 		.ead_seq = ENA_ATTACH_DEV_INIT,
1740 		.ead_name = "device initialization",
1741 		.ead_attach_fn = ena_attach_device_init,
1742 		.ead_attach_hard_fail = true,
1743 		.ead_cleanup_fn = ena_cleanup_device_init,
1744 	},
1745 
1746 	{
1747 		.ead_seq = ENA_ATTACH_READ_CONF,
1748 		.ead_name = "ena.conf",
1749 		.ead_attach_fn = ena_attach_read_conf,
1750 		.ead_attach_hard_fail = true,
1751 		.ead_cleanup_fn = NULL,
1752 	},
1753 
1754 	{
1755 		.ead_seq = ENA_ATTACH_DEV_CFG,
1756 		.ead_name = "device config",
1757 		.ead_attach_fn = ena_attach_dev_cfg,
1758 		.ead_attach_hard_fail = true,
1759 		.ead_cleanup_fn = NULL,
1760 	},
1761 
1762 	{
1763 		.ead_seq = ENA_ATTACH_INTR_ALLOC,
1764 		.ead_name = "interrupt allocation",
1765 		.ead_attach_fn = ena_attach_intr_alloc,
1766 		.ead_attach_hard_fail = true,
1767 		.ead_cleanup_fn = ena_cleanup_intr_alloc,
1768 	},
1769 
1770 	{
1771 		.ead_seq = ENA_ATTACH_INTR_HDLRS,
1772 		.ead_name = "interrupt handlers",
1773 		.ead_attach_fn = ena_intr_add_handlers,
1774 		.ead_attach_hard_fail = true,
1775 		.ead_cleanup_fn = ena_intr_remove_handlers,
1776 	},
1777 
1778 	{
1779 		.ead_seq = ENA_ATTACH_TXQS_ALLOC,
1780 		.ead_name = "Tx queues",
1781 		.ead_attach_fn = ena_attach_alloc_txqs,
1782 		.ead_attach_hard_fail = true,
1783 		.ead_cleanup_fn = ena_cleanup_txqs,
1784 	},
1785 
1786 	{
1787 		.ead_seq = ENA_ATTACH_RXQS_ALLOC,
1788 		.ead_name = "Rx queues",
1789 		.ead_attach_fn = ena_attach_alloc_rxqs,
1790 		.ead_attach_hard_fail = true,
1791 		.ead_cleanup_fn = ena_cleanup_rxqs,
1792 	},
1793 
1794 	/*
1795 	 * The chance of mac_unregister() failure poses a problem to
1796 	 * cleanup. We address interrupt disablement and mac
1797 	 * unregistration explicitly in the attach/detach routines.
1798 	 */
1799 	{
1800 		.ead_seq = ENA_ATTACH_MAC_REGISTER,
1801 		.ead_name = "mac registration",
1802 		.ead_attach_fn = ena_mac_register,
1803 		.ead_attach_hard_fail = true,
1804 		.ead_cleanup_fn = NULL,
1805 	},
1806 
1807 	{
1808 		.ead_seq = ENA_ATTACH_INTRS_ENABLE,
1809 		.ead_name = "enable interrupts",
1810 		.ead_attach_fn = ena_intrs_enable,
1811 		.ead_attach_hard_fail = true,
1812 		.ead_cleanup_fn = NULL,
1813 	}
1814 };
1815 
1816 /*
1817  * This function undoes any work done by ena_attach(), either in
1818  * response to a failed attach or a planned detach. At the end of this
1819  * function ena_attach_seq should be zero, otherwise it means
1820  * something has not be freed/uninitialized.
1821  */
1822 static void
1823 ena_cleanup(ena_t *ena)
1824 {
1825 	if (ena == NULL || ena->ena_attach_seq == 0) {
1826 		return;
1827 	}
1828 
1829 	/*
1830 	 * We VERIFY this because if the seq is greater than entries
1831 	 * we drift into space and execute god knows what.
1832 	 */
1833 	VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);
1834 
1835 	while (ena->ena_attach_seq > 0) {
1836 		int idx = ena->ena_attach_seq - 1;
1837 		ena_attach_desc_t *desc = &ena_attach_tbl[idx];
1838 
1839 		ena_dbg(ena, "running cleanup sequence: %s (%d)",
1840 		    desc->ead_name, idx);
1841 
1842 		if (desc->ead_cleanup_fn != NULL)
1843 			desc->ead_cleanup_fn(ena, false);
1844 		ena->ena_attach_seq--;
1845 	}
1846 
1847 	ASSERT3U(ena->ena_attach_seq, ==, 0);
1848 	mutex_destroy(&ena->ena_lock);
1849 	mutex_destroy(&ena->ena_watchdog_lock);
1850 }
1851 
1852 static int
1853 ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1854 {
1855 	ena_t *ena;
1856 
1857 	if (cmd != DDI_ATTACH) {
1858 		return (DDI_FAILURE);
1859 	}
1860 
1861 	ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
1862 	ena->ena_instance = ddi_get_instance(dip);
1863 	ena->ena_dip = dip;
1864 	ena->ena_instance = ddi_get_instance(dip);
1865 	ena->ena_page_sz = ddi_ptob(dip, 1);
1866 
1867 	for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
1868 		bool success;
1869 		ena_attach_desc_t *desc = &ena_attach_tbl[i];
1870 
1871 		ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
1872 		    i);
1873 
1874 		if (!(success = desc->ead_attach_fn(ena))) {
1875 			ena_err(ena, "attach sequence failed: %s (%d)",
1876 			    desc->ead_name, i);
1877 
1878 			if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
1879 				/*
1880 				 * In this specific case
1881 				 * ENA_ATTACH_INTRS_ENABLE has failed,
1882 				 * and we may or may not be able to
1883 				 * unregister the mac, depending on if
1884 				 * something in userspace has created
1885 				 * a client on top.
1886 				 *
1887 				 * NOTE: Something that would be nice
1888 				 * to add to mac is the ability to
1889 				 * register a provider separate from
1890 				 * "publishing" it to the rest of the
1891 				 * system. This would allow a driver
1892 				 * to register its mac, do some
1893 				 * additional work that might fail,
1894 				 * and then unregister if that work
1895 				 * fails without concern for any
1896 				 * chance of failure when calling
1897 				 * unregister. This would remove the
1898 				 * complexity of the situation we are
1899 				 * trying to address here, as we would
1900 				 * know that until the mac has been
1901 				 * "published", there is no chance for
1902 				 * mac_unregister() to fail.
1903 				 */
1904 				if (ena_mac_unregister(ena) != 0) {
1905 					return (DDI_FAILURE);
1906 				}
1907 
1908 				ena->ena_attach_seq--;
1909 			} else {
1910 				/*
1911 				 * Since the ead_seq is predicated on
1912 				 * successful ead_attach_fn we must
1913 				 * run the specific cleanup handler
1914 				 * before calling the global cleanup
1915 				 * routine. This also means that all
1916 				 * cleanup functions must be able to
1917 				 * deal with partial success of the
1918 				 * corresponding ead_attach_fn.
1919 				 */
1920 				if (desc->ead_cleanup_fn != NULL)
1921 					desc->ead_cleanup_fn(ena, false);
1922 			}
1923 
1924 			ena_cleanup(ena);
1925 			kmem_free(ena, sizeof (ena_t));
1926 			return (DDI_FAILURE);
1927 		}
1928 
1929 		if (success) {
1930 			ena_dbg(ena, "attach sequence completed: %s (%d)",
1931 			    desc->ead_name, i);
1932 		}
1933 
1934 		ena->ena_attach_seq = desc->ead_seq;
1935 	}
1936 
1937 	/*
1938 	 * Now that interrupts are enabled, unmask the admin interrupt.
1939 	 * Note that this interrupt is generated for both the admin queue and
1940 	 * the AENQ, but this driver always polls the admin queue. The surplus
1941 	 * interrupt for admin command completion triggers a harmless check of
1942 	 * the AENQ.
1943 	 */
1944 	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1945 	ena_aenq_enable(ena);
1946 
1947 	ddi_set_driver_private(dip, ena);
1948 
1949 	ena_update_regcache(ena);
1950 
1951 	atomic_or_32(&ena->ena_state, ENA_STATE_INITIALIZED);
1952 
1953 	return (DDI_SUCCESS);
1954 }
1955 
1956 static int
1957 ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1958 {
1959 	ena_t *ena = ddi_get_driver_private(dip);
1960 
1961 	if (ena == NULL) {
1962 		return (DDI_FAILURE);
1963 	}
1964 
1965 	/*
1966 	 * Before we can proceed to cleanup we have to treat
1967 	 * mac_unregister() explicitly -- if there are still
1968 	 * outstanding clients, then we can't proceed with detach or
1969 	 * cleanup.
1970 	 */
1971 
1972 	/*
1973 	 * Why this would fail I don't know, but if we proceed to mac
1974 	 * unregister, then there is a good chance we will panic in
1975 	 * the Rx interrupt handler when calling mac_rx_ring()
1976 	 */
1977 	if (!ena_intrs_disable(ena)) {
1978 		return (DDI_FAILURE);
1979 	}
1980 
1981 	/* We can't detach if clients are actively using the device. */
1982 	if (ena_mac_unregister(ena) != 0) {
1983 		(void) ena_intrs_enable(ena);
1984 		return (DDI_FAILURE);
1985 	}
1986 
1987 	/*
1988 	 * At this point we can proceed with the rest of cleanup on a
1989 	 * best-effort basis.
1990 	 */
1991 	ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
1992 	ena_cleanup(ena);
1993 	ddi_set_driver_private(dip, NULL);
1994 	kmem_free(ena, sizeof (ena_t));
1995 	return (DDI_SUCCESS);
1996 }
1997 
1998 static struct cb_ops ena_cb_ops = {
1999 	.cb_open = nodev,
2000 	.cb_close = nodev,
2001 	.cb_strategy = nodev,
2002 	.cb_print = nodev,
2003 	.cb_dump = nodev,
2004 	.cb_read = nodev,
2005 	.cb_write = nodev,
2006 	.cb_ioctl = nodev,
2007 	.cb_devmap = nodev,
2008 	.cb_mmap = nodev,
2009 	.cb_segmap = nodev,
2010 	.cb_chpoll = nochpoll,
2011 	.cb_prop_op = ddi_prop_op,
2012 	.cb_flag = D_MP,
2013 	.cb_rev = CB_REV,
2014 	.cb_aread = nodev,
2015 	.cb_awrite = nodev
2016 };
2017 
2018 static struct dev_ops ena_dev_ops = {
2019 	.devo_rev = DEVO_REV,
2020 	.devo_refcnt = 0,
2021 	.devo_getinfo = NULL,
2022 	.devo_identify = nulldev,
2023 	.devo_probe = nulldev,
2024 	.devo_attach = ena_attach,
2025 	.devo_detach = ena_detach,
2026 	.devo_reset = nodev,
2027 	.devo_quiesce = ddi_quiesce_not_supported,
2028 	.devo_cb_ops = &ena_cb_ops
2029 };
2030 
2031 static struct modldrv ena_modldrv = {
2032 	.drv_modops = &mod_driverops,
2033 	.drv_linkinfo = "AWS ENA Ethernet",
2034 	.drv_dev_ops = &ena_dev_ops
2035 };
2036 
2037 static struct modlinkage ena_modlinkage = {
2038 	.ml_rev = MODREV_1,
2039 	.ml_linkage = { &ena_modldrv, NULL }
2040 };
2041 
2042 int
2043 _init(void)
2044 {
2045 	int ret;
2046 
2047 	mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);
2048 
2049 	if ((ret = mod_install(&ena_modlinkage)) != 0) {
2050 		mac_fini_ops(&ena_dev_ops);
2051 		return (ret);
2052 	}
2053 
2054 	return (ret);
2055 }
2056 
2057 int
2058 _info(struct modinfo *modinfop)
2059 {
2060 	return (mod_info(&ena_modlinkage, modinfop));
2061 }
2062 
2063 int
2064 _fini(void)
2065 {
2066 	int ret;
2067 
2068 	if ((ret = mod_remove(&ena_modlinkage)) != 0) {
2069 		return (ret);
2070 	}
2071 
2072 	mac_fini_ops(&ena_dev_ops);
2073 	return (ret);
2074 }
2075