1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2024 Oxide Computer Company
14 */
15
16 #include "ena_hw.h"
17 #include "ena.h"
18
19 /*
20 * Elastic Network Adapter (ENA) Driver
21 * ------------------------------------
22 *
23 * The ena driver provides support for the AWS ENA device, also
24 * referred to as their "enhanced networking". This device is present
25 * on "Nitro"-based instances. It presents itself with the following
26 * PCI Vendor/Device IDs
27 *
28 * o 1d0f:0ec2 -- ENA PF
29 * o 1d0f:1ec2 -- ENA PF (Reserved)
30 * o 1d0f:ec20 -- ENA VF
31 * o 1d0f:ec21 -- ENA VF (Reserved)
32 *
33 * This driver provides support for only the essential features needed
34 * to drive traffic on an ENA device. Support for the following
35 * features IS NOT currently implemented.
36 *
37 * o Admin Queue Interrupts: queue completion events are always polled
38 * o FMA
39 * o Rx checksum offloads
40 * o Tx checksum offloads
41 * o Tx DMA bind (borrow buffers)
42 * o Rx DMA bind (loaned buffers)
43 * o TSO
44 * o RSS
45 * o Low Latency Queues (LLQ)
46 * o Support for different Tx completion policies
47 * o More controlled Tx recycling and Rx refill
48 *
49 * Even without these features the ena driver should perform
50 * reasonably well.
51 *
52 * Driver vs. Hardware Types
53 * -------------------------
54 *
55 * To properly communicate with the ENA device the driver must
56 * populate memory (registers and buffers) with specific types. These
57 * types are defined by the device and are found under the "common"
58 * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
59 * simplified this a bit by defining all device-specific types in the
60 * ena_hw.h file. Furthermore, all device-specific types are given an
61 * "enahw" prefix. This makes it clear when we are dealing with a
62 * device type and when we are dealing with a driver type.
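 * For example, enahw_tx_desc_t (the device's Tx submission
 * descriptor) is a device type, while ena_txq_t (the driver's Tx
 * queue state) is a driver type.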
63 *
64 * [1]: https://github.com/amzn/amzn-drivers
65 *
66 * Groups, Rings (Queues), and Interrupts
67 * --------------------------------------
68 *
69 * The ENA device presents one mac group. This single mac group
70  * represents the single unicast address assigned to this device in
71  * your AWS instance. The ENA device presents no option for
72 * configuring additional MAC addresses, multicast, or promisc mode --
73 * you receive only what AWS wants you to receive.
74 *
75 * This single mac group may have one or more rings. The ENA driver
76 * refers to rings as queues, for no special reason other than it was
77 * the dominant language in the Linux and FreeBSD drivers, and it
78 * spilled over into this port. The upper bound on number of queues is
79 * presented by the device. However, we don't just go with whatever
80  * number of queues the device reports; rather, we limit the queues
81 * based on other factors such as an absolute maximum, number of
82 * online CPUs, and number of available interrupts. The upper bound is
83 * calculated by ena_set_max_io_queues(), and that is used and
84  * possibly further restricted in ena_attach_intr_alloc(). At that
85 * point, ultimately, it is the number of available interrupts (minus
86 * one for the admin queue) that determines the number of queues: one
87 * Tx and one Rx on each I/O interrupt.
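 * For example, if the device makes 9 MSI-X vectors available, one is
 * reserved for the admin queue and AENQ, leaving 8 I/O vectors and
 * therefore at most 8 Tx/Rx queue pairs (assuming at least 8 online
 * CPUs and a device queue limit of 8 or more).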
88 *
89 * NOTE: Perhaps it is overly restrictive to limit the number of
90 * queues to the number of I/O interrupts. Something worth considering
91  * on larger instances if they present far fewer interrupts than they
92 * do queues + CPUs.
93 *
94 * The ENA device presents MSI-X interrupts only. During attach the
95 * driver queries the number of available interrupts and sets aside
96 * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
97 * This means that a Tx/Rx queue at index 0 will map to vector 1, and
98 * so on.
99 *
100 * NOTE: The ENA driver currently doesn't make full use of the Admin
101 * Queue interrupt. This interrupt is used both to notify the driver
102 * when a command response is ready, and when an async event is posted.
103 * The ENA driver always polls the Admin Queue for responses.
104 *
105 * Tx Queue Workings
106 * -----------------
107 *
108 * A single Tx queue (ena_txq_t) is made up of one submission queue
109 * (SQ) and its paired completion queue (CQ). These two queues form a
110 * logical descriptor ring which is used to send packets out of the
111 * device -- where each SQ entry describes the packet to be sent
112 * (enahw_tx_desc_t) and each CQ entry describes the result of sending
113 * a packet (enahw_tx_cdesc_t). For this to work the host and device
114 * must agree on which descriptors are currently owned by the host
115 * (free for sending) and which are owned by the device (pending
116 * device completion). This state is tracked on the host side via head
117 * and tail indexes along with a phase value.
118 *
119 * The head and tail values represent the head and tail of the FIFO
120 * queue of pending packets -- the next packet to be sent by the
121 * device is head, and all descriptors up to tail are ready for
122 * sending. The phase allows the host to determine which CQ
123 * descriptors represent completed events when using per-SQ completion
124 * events (as opposed to queue head pointer updates). As the queues
125 * represent a logical ring buffer, the phase must alternate on
126 * wrap-around. The device initializes the phase to zero, and the host
127  * starts with a phase of 1. The first descriptors written, and
128 * their corresponding completions, are indicated with a phase of 1.
129 *
130 *
131 * For example, the diagram below represents the SQ/CQ state after the
132 * first 6 packets have been sent by the host and 2 of them have been
133 * completed by the device (and these completions have been processed
134 * by the driver). In this state the host could send 4 more packets
135 * before needing to wait on completion events.
136 *
137 *
138 * +---+---+---+---+---+---+---+---+
139 * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1
140 * +---+---+---+---+---+---+---+---+
141 * ^
142 * |
143 * tail
144 * head
145 * |
146 * v
147 * +---+---+---+---+---+---+---+---+
148 * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1
149 * +---+---+---+---+---+---+---+---+
150 *
151 *
152 * The next diagram shows how the state changes as 5 more packets are
153 * sent (for a total of 11) and 7 more are completed (for a total of
154 * 9). Notice that as the SQ and CQ have wrapped around their phases
155 * have been complemented. In this state the host could send 6 more
156 * packets before needing to wait on completion events.
157 *
158 * +---+---+---+---+---+---+---+---+
159 * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0
160 * +---+---+---+---+---+---+---+---+
161 * ^
162 * |
163 * tail
164 * head
165 * |
166 * v
167 * +---+---+---+---+---+---+---+---+
168 * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0
169 * +---+---+---+---+---+---+---+---+
170 *
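 * As a rough sketch of how a driver can consume such a CQ using the
 * phase bit (illustrative only, with made-up names, not the driver's
 * actual completion code):
 *
 *     uint16_t head = txq->cq_head_idx;
 *     uint16_t phase = txq->cq_phase;
 *
 *     for (;;) {
 *             cdesc = &cq_descs[head % num_cq_descs];
 *             if (cdesc_phase(cdesc) != phase)
 *                     break;                  (device not done yet)
 *             recycle_tcb(txq, cdesc);        (return TCB for reuse)
 *             head++;
 *             if ((head % num_cq_descs) == 0)
 *                     phase ^= 1;             (wrapped; flip phase)
 *     }
 *
 *     txq->cq_head_idx = head;
 *     txq->cq_phase = phase;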
171 *
172 * Currently, all packets are copied for Tx. At ring start we allocate
173  * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a
174  * DMA buffer associated with it, and each buffer is large enough to
175 * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
176 * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
177 * the TCB's DMA buffer, and a new descriptor is written to the SQ
178 * describing said TCB buffer. If and when we add more advanced
179 * features like DMA binding of mblks and TSO, this 1:1 guarantee will
180 * no longer hold.
181 *
182 * Rx Queue Workings
183 * -----------------
184 *
185 * In terms of implementing the logical descriptor ring, the Rx queues
186 * are very much like the Tx queues. There is a paired SQ and CQ for
187 * each logical ring. The difference is that in Rx the SQ is for
188 * handing buffers to the device to fill, and the CQ is for describing
189 * the contents of those buffers for a given received frame. At Rx
190  * ring start we allocate an Rx Control Buffer (RCB) for each
191  * descriptor in the ring. Each RCB has a DMA buffer associated with
192  * it, and each buffer is large enough to hold the MTU. For each
193 * received frame we copy the contents out of the RCB and into its own
194 * mblk, immediately returning the RCB for reuse. As with Tx, this
195 * gives us a simple 1:1 mapping currently, but if more advanced
196 * features are implemented later this could change.
197 *
198 * Asynchronous Event Notification Queue (AENQ)
199 * --------------------------------------------
200 *
201 * Each ENA device comes with a mechanism for sending out-of-band
202 * notifications to the driver. This includes events like link state
203 * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
204 * delivery mechanism is via interrupt, handled by the ena_aenq_work()
205 * function, which dispatches via the eaenq_hdlrs table. If no handler
206 * is registered, the ena_aenq_default_hdlr() handler is used. A given
207 * device may not support all the different event types
208 * (enahw_aenq_groups_t); and the driver may choose to enable a subset
209 * of the supported events. During attach we call ena_aenq_configure()
210 * to negotiate the supported/enabled events. The enabled group is
211 * stored at ena_aenq_enabled_groups.
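 *
 * Conceptually the dispatch is a table lookup keyed on the event's
 * group. A simplified sketch (names here are illustrative, not the
 * exact fields used by the driver):
 *
 *     hdlr = hdlrs[entry_group];
 *     if (hdlr == NULL)
 *             hdlr = ena_aenq_default_hdlr;
 *     hdlr(ena, entry);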
212 *
213 * Queues and Unsigned Wraparound
214 * ------------------------------
215 *
216 * All the queues use a uint16_t value as their head/tail values, e.g.
217 * the Rx queue's er_cq_head_idx value. You might notice that we only
218 * ever increment these values, letting them perform implicit unsigned
219 * integer wraparound. This is intended. This is the same behavior as
220 * the common code, and seems to be what the hardware expects. Of
221 * course, when accessing our own descriptor arrays we must make sure
222 * to first perform a modulo of this value or risk running off into
223 * space.
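 *
 * For example, with a hypothetical ring of 128 descriptors:
 *
 *     uint16_t idx = 0xffff;          (65535 % 128 == 127, last slot)
 *     desc = &ring[idx % 128];
 *     idx++;                          (uint16_t wraps around to 0)
 *     desc = &ring[idx % 128];        (back to slot 0)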
224 *
225 * Watchdog and Device Reset
226 * -------------------------
227 *
228 * While the device is running, the driver periodically invokes a
229 * watchdog function to check that all is well, and to reset the
230 * device if not. The device will be reset if any of the following is
231 * true:
232 *
233 * o The device's status register fatal error bit is set. A device
234 * in this state will no longer process any queues;
235 * o No asynchronous event keepalives have been received for some
236 * time -- see ENA_DEVICE_KEEPALIVE_TIMEOUT_NS;
237 * o A Tx queue has remained blocked for some time -- see
238 * ENA_TX_STALL_TIMEOUT;
239 * o The device has requested, via an asynchronous event, that we
240 * perform a reset;
241 * o Driver code has detected an error and set the EN_STATE_ERROR
242 * bit in ena_state.
243 *
244 * There is a "fatal error" asynchronous event, but common code does
245 * not use that as a reason to trigger a reset, and so neither do we.
246 *
247 * The global `ena_force_reset` variable can be used as a simple means
248 * to trigger a reset during driver development and testing. If there
249 * are multiple instances, it is likely that only one of them will
250 * reset when this variable is changed to `true`.
251 *
252 * Attach Sequencing
253 * -----------------
254 *
255 * Most drivers implement their attach/detach/cleanup functions as a
256 * sequential stream of function calls used to allocate and initialize
257 * resources in an order determined by the device's programming manual
258 * combined with any requirements imposed by the kernel and its
259 * relevant modules. These functions can become quite long. It is
260 * often hard to see the order in which steps are taken, and even
261 * harder to tell if detach/cleanup undoes them in the correct order,
262 * or even if it undoes them at all! The only sure way to understand
263 * the flow is to take good notes while closely inspecting each line
264 * of code. Even then, it's easy for attach and detach to get out of
265 * sync.
266 *
267 * Some more recent drivers have improved on this situation by using a
268 * bit vector to track the sequence of events in attach/detach. Each
269  * bit is declared as an enum value, in the same order in which
270  * attach is expected to run, and thus detach runs in the exact
271 * opposite order. This has three main benefits:
272 *
273 * 1. It makes it easier to determine sequence order at a
274 * glance.
275 *
276 * 2. It gives a better idea of what state the device is in during
277 * debugging (the sequence bit vector is kept with the instance
278 * state).
279 *
280 * 3. The detach function can verify that all sequence bits are
281 * cleared, indicating that everything done in attach was
282 * successfully undone.
283 *
284 * These are great improvements. However, the attach/detach functions
285 * can still become unruly, and there is still no guarantee that
286 * detach is done in opposite order of attach (this is not always
287 * strictly required, but is probably the best way to write detach).
288 * There is still a lot of boilerplate and chance for programmer
289 * error.
290 *
291 * The ena driver takes the sequence idea a bit further, creating a
292 * descriptor table of the attach sequence (ena_attach_tbl). This
293 * table is used by attach/detach to generically, declaratively, and
294 * programmatically enforce the precise sequence order and verify that
295 * anything that is done is undone. This provides several benefits:
296 *
297 * o Correct order is enforced implicitly by the descriptor table.
298  * It is impossible for the detach sequence to run in any order
299  * other than the opposite of attach.
300 *
301 * o It is obvious what the precise attach sequence is. While the
302  * bit vector enum helps a lot with this, it doesn't prevent
303  * programmer error. Defining the sequence as a declarative
304  * table makes it easy for the programmer to see the order and
305 * know it's followed exactly.
306 *
307 * o It is impossible to modify the attach sequence without also
308 * specifying a callback for its dual in the detach sequence.
309 *
310 * o Common and repetitive code like error checking, logging, and bit
311 * vector modification is eliminated and centralized, again
312 * reducing the chance of programmer error.
313 *
314 * The ena attach sequence is defined under ena_attach_seq_t. The
315 * descriptor table is defined under ena_attach_tbl.
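 *
 * To sketch the idea (a simplification using illustrative names, not
 * the driver's exact types or logic), cleanup amounts to walking the
 * descriptor table in reverse and undoing any step that was done:
 *
 *     for (int i = nentries - 1; i >= 0; i--) {
 *             const attach_desc_t *desc = &attach_tbl[i];
 *
 *             if (!step_was_done(ena, desc))
 *                     continue;
 *             desc->cleanup_fn(ena, false);
 *             mark_step_undone(ena, desc);
 *     }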
316 */
317
318 /*
319 * These are some basic data layout invariants on which development
320  * assumptions were made.
321 */
322 CTASSERT(sizeof (enahw_tx_data_desc_t) == 16);
323 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
324 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
325 CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));
326
327 /*
328  * Amazon does not specify the endianness of the ENA device. We assume
329 * it's the same as the bus, and we assume the CPU/bus is always
330 * little endian.
331 */
332 #ifdef _BIG_ENDIAN
333 #error "ENA driver is little-endian only"
334 #endif
335
336 /*
337 * These values are used to communicate the driver version to the AWS
338 * hypervisor via the ena_set_host_info() function. We don't know what
339 * exactly AWS does with this info, but it's fairly safe to assume
340 * it's used solely for debug/informational purposes. The Linux driver
341 * updates these values frequently as bugs are fixed and features are
342 * added.
343 */
344 #define ENA_DRV_VER_MAJOR 1
345 #define ENA_DRV_VER_MINOR 0
346 #define ENA_DRV_VER_SUBMINOR 0
347
348 uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;
349
350 /*
351 * Log an error message. We leave the destination (console or system
352  * log) up to the caller.
353 */
354 void
355 ena_err(const ena_t *ena, const char *fmt, ...)
356 {
357 va_list ap;
358
359 va_start(ap, fmt);
360 if (ena != NULL && ena->ena_dip != NULL) {
361 vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
362 } else {
363 vcmn_err(CE_WARN, fmt, ap);
364 }
365 va_end(ap);
366 }
367
368 void
369 ena_panic(const ena_t *ena, const char *fmt, ...)
370 {
371 va_list ap;
372
373 va_start(ap, fmt);
374 if (ena != NULL && ena->ena_dip != NULL) {
375 vdev_err(ena->ena_dip, CE_PANIC, fmt, ap);
376 } else {
377 vcmn_err(CE_PANIC, fmt, ap);
378 }
379 va_end(ap);
380 }
381
382 /*
383 * Set this to true to enable debug messages.
384 */
385 bool ena_debug = false;
386
387 /*
388 * Log a debug message. We force all debug messages to go to the
389 * system log.
390 */
391 void
392 ena_dbg(const ena_t *ena, const char *fmt, ...)
393 {
394 va_list ap;
395
396 if (ena_debug) {
397 char msg[1024];
398
399 va_start(ap, fmt);
400 (void) vsnprintf(msg, sizeof (msg), fmt, ap);
401 va_end(ap);
402
403 if (ena != NULL && ena->ena_dip != NULL) {
404 dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
405 } else {
406 cmn_err(CE_NOTE, "!%s", msg);
407 }
408 }
409 }
410
411 void
412 ena_trigger_reset(ena_t *ena, enahw_reset_reason_t reason)
413 {
414 mutex_enter(&ena->ena_lock);
415 ena->ena_reset_reason = reason;
416 mutex_exit(&ena->ena_lock);
417 atomic_or_32(&ena->ena_state, ENA_STATE_ERROR);
418 }
419
420 /*
421 * Determine if a given feature is available on this device.
422 */
423 bool
424 ena_is_feat_avail(ena_t *ena, const enahw_feature_id_t feat_id)
425 {
426 VERIFY3U(feat_id, <=, ENAHW_FEAT_NUM);
427 uint32_t mask = 1U << feat_id;
428
429 /*
430 * The device attributes feature is always supported, as
431 * indicated by the common code.
432 */
433 if (feat_id == ENAHW_FEAT_DEVICE_ATTRIBUTES)
434 return (true);
435
436 return ((ena->ena_supported_features & mask) != 0);
437 }
438
439 /*
440 * Determine if a given capability is available on this device.
441 */
442 bool
443 ena_is_cap_avail(ena_t *ena, const enahw_capability_id_t cap_id)
444 {
445 VERIFY3U(cap_id, <=, ENAHW_CAP_NUM);
446 uint32_t mask = 1U << cap_id;
447
448 return ((ena->ena_capabilities & mask) != 0);
449 }
450
451 static bool
452 ena_device_reset(ena_t *ena, enum enahw_reset_reason_types reason)
453 {
454 uint32_t rval, wval, reason_lsb, reason_msb;
455 hrtime_t timeout, expired;
456
457 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
458 if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
459 ena_err(ena, "reset: device is not ready");
460 return (false);
461 }
462
463 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
464
465 /*
466 * The device stores the reset timeout at 100ms resolution; we
467 * normalize that to nanoseconds.
468 */
469 timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);
470
471 if (timeout == 0) {
472 ena_err(ena, "device gave invalid (0) reset timeout");
473 return (false);
474 }
475
476 expired = gethrtime() + timeout;
477
478 wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
479
480 reason_lsb = ENAHW_RESET_REASON_LSB(reason);
481 reason_msb = ENAHW_RESET_REASON_MSB(reason);
482
483 wval |= (reason_lsb << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
484 ENAHW_DEV_CTL_RESET_REASON_MASK;
485 if (ena_is_cap_avail(ena, ENAHW_CAP_EXTENDED_RESET_REASONS)) {
486 wval |= (reason_msb << ENAHW_DEV_CTL_RESET_REASON_EXT_SHIFT) &
487 ENAHW_DEV_CTL_RESET_REASON_EXT_MASK;
488 } else if (reason_msb != 0) {
489 /* Fall back to "generic" which we know will fit */
490 wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
491 wval |= (ENAHW_RESET_GENERIC <<
492 ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
493 ENAHW_DEV_CTL_RESET_REASON_MASK;
494 }
495
496 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);
497
498 /*
499 * Make sure reset is in progress.
500 */
501 for (;;) {
502 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
503
504 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0)
505 break;
506
507 if (gethrtime() > expired) {
508 ena_err(ena, "device reset start timed out");
509 return (false);
510 }
511
512 /* Sleep for 100 milliseconds. */
513 delay(drv_usectohz(100 * 1000));
514 }
515
516 /*
517 * Reset the timeout counter for the next device request.
518 */
519 expired = gethrtime() + timeout;
520
521 /*
522 * Wait for the device reset to finish.
523 */
524 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
525 for (;;) {
526 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
527
528 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
529 break;
530 }
531
532 if (gethrtime() > expired) {
533 ena_err(ena, "device reset timed out");
534 return (false);
535 }
536
537 /* Sleep for 100 milliseconds. */
538 delay(drv_usectohz(100 * 1000));
539 }
540
541 ena_dbg(ena, "device reset succeeded");
542
543 return (true);
544 }
545
546 static bool
547 ena_attach_pci(ena_t *ena)
548 {
549 ddi_acc_handle_t hdl;
550
551 if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
552 return (false);
553 }
554
555 ena->ena_pci_hdl = hdl;
556 ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
557 ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
558 ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
559 ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
560 ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
561 ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
562 ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
563 ena->ena_pci_svid, ena->ena_pci_sdid);
564
565 return (true);
566 }
567
568 static void
569 ena_cleanup_pci(ena_t *ena, bool resetting)
570 {
571 VERIFY0(resetting);
572 pci_config_teardown(&ena->ena_pci_hdl);
573 }
574
575 static void
576 ena_cleanup_regs_map(ena_t *ena, bool resetting)
577 {
578 VERIFY0(resetting);
579 ddi_regs_map_free(&ena->ena_reg_hdl);
580 }
581
582 static bool
583 ena_attach_regs_map(ena_t *ena)
584 {
585 int ret = 0;
586
587 if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
588 DDI_SUCCESS) {
589 ena_err(ena, "failed to get register set %d size",
590 ENA_REG_NUMBER);
591 return (false);
592 }
593
594 ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
595 bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
596 ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
597 ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
598 ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
599
600 /*
601 * This function can return several different failure values,
602 * so we make sure to capture its return value for the purpose
603 * of logging.
604 */
605 ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
606 &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
607 &ena->ena_reg_hdl);
608
609 if (ret != DDI_SUCCESS) {
610 ena_err(ena, "failed to map register set %d: %d",
611 ENA_REG_NUMBER, ret);
612 return (false);
613 }
614
615 ena_dbg(ena, "registers mapped to base: 0x%p",
616 (void *)ena->ena_reg_base);
617
618 return (true);
619 }
620
621 /*
622 * Free any resources related to the admin submission queue.
623 */
624 static void
625 ena_admin_sq_free(ena_t *ena)
626 {
627 ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
628 }
629
630 /*
631 * Initialize the admin submission queue.
632 */
633 static bool
634 ena_admin_sq_init(ena_t *ena)
635 {
636 ena_adminq_t *aq = &ena->ena_aq;
637 ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
638 size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
639 uint32_t addr_low, addr_high, wval;
640
641 if (aq->ea_sq.eas_entries == NULL) {
642 ena_dma_conf_t conf = {
643 .edc_size = size,
644 .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
645 .edc_sgl = 1,
646 .edc_endian = DDI_NEVERSWAP_ACC,
647 .edc_stream = false,
648 };
649
650 if (!ena_dma_alloc(ena, dma, &conf, size)) {
651 ena_err(ena, "failed to allocate DMA for Admin SQ");
652 return (false);
653 }
654
655 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
656 aq->ea_sq.eas_entries = (void *)dma->edb_va;
657 } else {
658 ena_dma_bzero(dma);
659 }
660
661 aq->ea_sq.eas_tail = 0;
662 aq->ea_sq.eas_phase = 1;
663 aq->ea_sq.eas_dbaddr =
664 (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
665 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
666 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
667 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
668 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
669 wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
670 ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
671 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);
672
673 return (true);
674 }
675
676 /*
677 * Free any resources related to the admin completion queue.
678 */
679 static void
680 ena_admin_cq_free(ena_t *ena)
681 {
682 ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
683 }
684
685 /*
686 * Initialize the admin completion queue.
687 */
688 static bool
689 ena_admin_cq_init(ena_t *ena)
690 {
691 ena_adminq_t *aq = &ena->ena_aq;
692 ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
693 uint32_t addr_low, addr_high, wval;
694
695 if (aq->ea_cq.eac_entries == NULL) {
696 size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
697 ena_dma_conf_t conf = {
698 .edc_size = size,
699 .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
700 .edc_sgl = 1,
701 .edc_endian = DDI_NEVERSWAP_ACC,
702 .edc_stream = false,
703 };
704
705 if (!ena_dma_alloc(ena, dma, &conf, size)) {
706 ena_err(ena, "failed to allocate DMA for Admin CQ");
707 return (false);
708 }
709
710 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
711 aq->ea_cq.eac_entries = (void *)dma->edb_va;
712 } else {
713 ena_dma_bzero(dma);
714 }
715
716 aq->ea_cq.eac_head = 0;
717 aq->ea_cq.eac_phase = 1;
718 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
719 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
720 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
721 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
722 wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
723 ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
724 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);
725
726 return (true);
727 }
728
729 void
730 ena_update_hints(ena_t *ena, enahw_device_hints_t *hints)
731 {
732 ena->ena_device_hints.eh_mmio_read_timeout =
733 hints->edh_mmio_read_timeout;
734 ena->ena_device_hints.eh_keep_alive_timeout =
735 hints->edh_keep_alive_timeout;
736 ena->ena_device_hints.eh_tx_comp_timeout = hints->edh_tx_comp_timeout;
737 ena->ena_device_hints.eh_missed_tx_reset_threshold =
738 hints->edh_missed_tx_reset_threshold;
739 ena->ena_device_hints.eh_admin_comp_timeout =
740 hints->edh_admin_comp_timeout;
741 ena->ena_device_hints.eh_max_tx_sgl = hints->edh_max_tx_sgl;
742 ena->ena_device_hints.eh_max_rx_sgl = hints->edh_max_rx_sgl;
743 }
744
745 /*
746 * We limit the max number of I/O queues based on several aspects of
747 * the underlying hardware.
748 *
749 * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
750 * which comes from the common code and presumably is based on device
751 * constraints.
752 *
753 * 2. Next we latch the number of I/O queues to the number of online
754 * CPUs. The idea being that each queue is a parallel work stream,
755 * and having more queues than CPUs to flush them will not improve
756 * performance. The number of online CPUs can change dynamically,
757 * and that's okay, everything should still work fine, it just
758 * might not be ideal.
759 *
760 * 3. Next we latch the number of I/O queues to the smallest of the
761 * max Tx queues and max Rx queues. We could probably loosen this
762 * restriction in the future, and have separate max I/O queues for
763 * Tx and Rx. This is what Linux does, and seems like a fine place
764 * to start.
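 *
 * For example (numbers purely illustrative): on an instance with 16
 * online CPUs, a device limit of 32, and 8 Tx/Rx SQs and CQs each,
 * the result is MIN(32, 16, 8) == 8 I/O queues.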
765 */
766 static void
767 ena_set_max_io_queues(ena_t *ena)
768 {
769 uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;
770
771 max = MIN(ncpus_online, max);
772 /*
773 * Supposedly a device could present a different number of SQs
774 * and CQs. This driver is designed in a way that requires
775 * each SQ to have a corresponding and dedicated CQ (how would
776 * it work otherwise). Therefore, we must check both values
777 * and find the minimum between them.
778 */
779 max = MIN(ena->ena_tx_max_sq_num, max);
780 max = MIN(ena->ena_tx_max_cq_num, max);
781 max = MIN(ena->ena_rx_max_sq_num, max);
782 max = MIN(ena->ena_rx_max_cq_num, max);
783
784
785 /* This shouldn't happen, but just in case. */
786 if (max == 0) {
787 max = 1;
788 }
789
790 ena->ena_max_io_queues = max;
791 }
792
793 /*
794 * We require that an Rx or Tx buffer be able to hold the maximum MTU
795 * along with the maximum frame header length. In this case we know
796 * ENA is presenting us an Ethernet frame so we add the size of an
797 * Ethernet VLAN header. Rx has the additional requirement of needing
798 * additional margin for the sake of IP header alignment.
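 *
 * For example, assuming a 4 KiB page size and the common AWS jumbo
 * MTU of 9001: the Ethernet VLAN header is 18 bytes, giving a maximum
 * frame total of 9019 bytes, which rounds up to a 12288 byte (3 page)
 * Tx buffer; the Rx buffer additionally adds the IP header alignment
 * margin before rounding.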
799 */
800 static void
801 ena_update_buf_sizes(ena_t *ena)
802 {
803 ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
804 ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
805 ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
806 ena->ena_page_sz, uint32_t);
807 ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
808 ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
809 }
810
811 static bool
812 ena_get_hints(ena_t *ena)
813 {
814 int ret;
815 enahw_resp_desc_t resp;
816 enahw_device_hints_t *hints = &resp.erd_resp.erd_get_feat.ergf_hints;
817
818 ena_dbg(ena, "Requesting hints");
819
820 bzero(&resp, sizeof (resp));
821 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_HW_HINTS,
822 ENAHW_FEAT_HW_HINTS_VER);
823
824 if (ret == ENOTSUP) {
825 /* In this case the device does not support querying hints */
826 ena_dbg(ena, "Hints are unsupported");
827 return (true);
828 } else if (ret != 0) {
829 ena_err(ena, "Error getting hints: %d", ret);
830 return (false);
831 }
832
833 ena_update_hints(ena, hints);
834
835 return (true);
836 }
837
838 static bool
839 ena_get_offloads(ena_t *ena)
840 {
841 int ret = 0;
842 enahw_resp_desc_t resp;
843 enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;
844
845 ena->ena_tx_l3_ipv4_csum = false;
846
847 ena->ena_tx_l4_ipv4_part_csum = false;
848 ena->ena_tx_l4_ipv4_full_csum = false;
849 ena->ena_tx_l4_ipv4_lso = false;
850
851 ena->ena_tx_l4_ipv6_part_csum = false;
852 ena->ena_tx_l4_ipv6_full_csum = false;
853 ena->ena_tx_l4_ipv6_lso = false;
854
855 ena->ena_rx_l3_ipv4_csum = false;
856 ena->ena_rx_l4_ipv4_csum = false;
857 ena->ena_rx_l4_ipv6_csum = false;
858 ena->ena_rx_hash = false;
859
860 bzero(&resp, sizeof (resp));
861 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
862 ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);
863
864 if (ret == ENOTSUP) {
865 /*
866 * In this case the device does not support querying
867 * for hardware offloads. We take that as a sign that
868 * the device provides no offloads.
869 */
870 return (true);
871 } else if (ret != 0) {
872 ena_err(ena, "error getting stateless offload: %d", ret);
873 return (false);
874 }
875
876 ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);
877
878 ena->ena_tx_l4_ipv4_part_csum =
879 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
880 ena->ena_tx_l4_ipv4_full_csum =
881 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
882 ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);
883
884 ena->ena_tx_l4_ipv6_part_csum =
885 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
886 ena->ena_tx_l4_ipv6_full_csum =
887 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
888 ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);
889
890 ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
891 ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
892 ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
893 return (true);
894 }
895
896 static int
897 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
898 const int defval)
899 {
900 int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
901 DDI_PROP_DONTPASS, propname, defval);
902
903 if (value > maxval) {
904 ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
905 propname, value, maxval);
906 value = maxval;
907 }
908
909 if (value < minval) {
910 ena_err(ena, "user value %s=%d below minimum, setting to %d",
911 propname, value, minval);
912 value = minval;
913 }
914
915 return (value);
916 }
917
918 static bool
919 ena_set_mtu(ena_t *ena)
920 {
921 int ret = 0;
922 enahw_cmd_desc_t cmd;
923 enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
924 enahw_resp_desc_t resp;
925
926 bzero(&cmd, sizeof (cmd));
927 bzero(&resp, sizeof (resp));
928 feat->efm_mtu = ena->ena_mtu;
929
930 if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
931 ENAHW_FEAT_MTU_VER)) != 0) {
932 ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
933 ret);
934 return (false);
935 }
936
937 return (true);
938 }
939
940 static void
941 ena_get_link_config(ena_t *ena)
942 {
943 enahw_resp_desc_t resp;
944 enahw_feat_link_conf_t *feat =
945 &resp.erd_resp.erd_get_feat.ergf_link_conf;
946 bool full_duplex;
947
948 bzero(&resp, sizeof (resp));
949
950 if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
951 ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
952 /*
953 * Some ENA devices do no support this feature. In
954 * those cases we report a 1Gbps link, full duplex.
955 * For the most accurate information on bandwidth
956 * limits see the official AWS documentation.
957 */
958 ena->ena_link_speed_mbits = 1000;
959 ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
960 ena->ena_link_duplex = LINK_DUPLEX_FULL;
961 ena->ena_link_autoneg = true;
962 return;
963 }
964
965 ena->ena_link_speed_mbits = feat->eflc_speed;
966 ena->ena_link_speeds = feat->eflc_supported;
967 full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
968 ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
969 LINK_DUPLEX_HALF;
970 ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
971 }
972
973 /*
974 * Retrieve all configuration values which are modifiable via
975 * ena.conf, and set ena_t members accordingly. While the conf values
976 * have priority, they may be implicitly modified by the driver to
977 * meet resource constraints on a given platform. If no value is
978 * specified in the conf file, the driver will attempt to use the
979  * largest value supported. While no supported value should be large
980  * enough for this to matter, keep in mind that ena_get_prop() will
981  * cast the values to an int.
982 *
983 * This function should be called after the device is initialized,
984 * admin queue is established, and the hardware features/capabs have
985 * been queried; it should be called before mac registration.
986 */
987 static bool
988 ena_attach_read_conf(ena_t *ena)
989 {
990 uint32_t gcv; /* Greatest Common Value */
991
992 /*
993 * We expect that the queue lengths are the same for both the
994 * CQ and SQ, but technically the device could return
995 * different lengths. For now the driver locks them together.
996 */
997 gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
998 ASSERT3U(gcv, <=, INT_MAX);
999 ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
1000 ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);
1001
1002 ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
1003 ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
1004 ENA_PROP_RXQ_INTR_LIMIT_DEF);
1005
1006 gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
1007 ASSERT3U(gcv, <=, INT_MAX);
1008 ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
1009 ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);
1010
1011 return (true);
1012 }
1013
1014 /*
1015 * Perform any necessary device configuration after the driver.conf
1016 * has been read.
1017 */
1018 static bool
1019 ena_attach_dev_cfg(ena_t *ena)
1020 {
1021 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);
1022
1023 if (!ena_set_mtu(ena)) {
1024 /*
1025 * We don't expect this to fail, but we try a fallback
1026 * first before failing the attach sequence.
1027 */
1028 ena->ena_mtu = 1500;
1029 ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);
1030
1031 if (!ena_set_mtu(ena)) {
1032 return (false);
1033 }
1034 }
1035
1036 return (true);
1037 }
1038
1039 static bool
1040 ena_check_versions(ena_t *ena)
1041 {
1042 uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
1043 uint32_t ctrl_vsn =
1044 ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);
1045
1046 ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
1047 ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);
1048
1049 ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
1050 ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
1051 ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
1052 ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);
1053
1054 ena_dbg(ena, "device version: %u.%u",
1055 ena->ena_dev_major_vsn, ena->ena_dev_minor_vsn);
1056 ena_dbg(ena, "controller version: %u.%u.%u implementation %u",
1057 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1058 ena->ena_ctrl_subminor_vsn, ena->ena_ctrl_impl_id);
1059
1060 if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
1061 ena_err(ena, "unsupported controller version: %u.%u.%u",
1062 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
1063 ena->ena_ctrl_subminor_vsn);
1064 return (false);
1065 }
1066
1067 return (true);
1068 }
1069
1070 static bool
1071 ena_adminq_init(ena_t *ena)
1072 {
1073 ena_adminq_t *aq = &ena->ena_aq;
1074
1075 /*
1076 * As we are not using an interrupt for admin queue completion
1077 * signaling, we do not need a priority on these mutexes. If
1078 * that changes, we will have to rejigger some code to create
1079 * the admin queue interrupt before this function.
1080 */
1081 mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
1082 mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
1083 mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
1084 aq->ea_qlen = ENA_ADMINQ_DEPTH;
1085 aq->ea_pending_cmds = 0;
1086
1087 aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
1088 KM_SLEEP);
1089 list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
1090 offsetof(ena_cmd_ctx_t, ectx_node));
1091 list_create(&aq->ea_cmd_ctxs_used, sizeof (ena_cmd_ctx_t),
1092 offsetof(ena_cmd_ctx_t, ectx_node));
1093
1094 ena_create_cmd_ctx(ena);
1095
1096 /*
1097 * Start in polling mode until we've determined the number of queues
1098 * and are ready to configure and enable interrupts.
1099 */
1100 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
1101 aq->ea_poll_mode = true;
1102
1103 return (true);
1104 }
1105
1106 /*
1107 * Free all resources allocated as part of ena_device_init().
1108 */
1109 static void
1110 ena_cleanup_device_init(ena_t *ena, bool resetting)
1111 {
1112 ena_adminq_t *aq = &ena->ena_aq;
1113
1114 VERIFY0(resetting);
1115
1116 ena_free_host_info(ena);
1117 mutex_destroy(&aq->ea_sq_lock);
1118 mutex_destroy(&aq->ea_cq_lock);
1119 mutex_destroy(&aq->ea_stat_lock);
1120 list_destroy(&aq->ea_cmd_ctxs_free);
1121 list_destroy(&aq->ea_cmd_ctxs_used);
1122 kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
1123 ena_admin_sq_free(ena);
1124 ena_admin_cq_free(ena);
1125 ena_aenq_free(ena);
1126 ena_stat_device_cleanup(ena);
1127 ena_stat_device_basic_cleanup(ena);
1128 ena_stat_device_extended_cleanup(ena);
1129 ena_stat_aenq_cleanup(ena);
1130 }
1131
1132 static bool
1133 ena_attach_device_init(ena_t *ena)
1134 {
1135 ena_adminq_t *aq = &ena->ena_aq;
1136 uint32_t rval;
1137 uint8_t dma_width;
1138 hrtime_t cmd_timeout;
1139 enahw_resp_desc_t resp;
1140 enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
1141 uint8_t *maddr;
1142 uint32_t supported_features;
1143 int ret = 0;
1144
1145 ena->ena_reset_reason = ENAHW_RESET_NORMAL;
1146 if (!ena_device_reset(ena, ena->ena_reset_reason))
1147 return (false);
1148
1149 if (!ena_check_versions(ena))
1150 return (false);
1151
1152 ena_init_regcache(ena);
1153
1154 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
1155 dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
1156 ena->ena_dma_width = dma_width;
1157
1158 /*
1159  * The value stored in the device register is at a
1160 * resolution of 100 milliseconds. We normalize that to
1161 * nanoseconds.
1162 */
1163 cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
1164 aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);
1165
1166 if (aq->ea_cmd_timeout_ns == 0)
1167 aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;
1168
1169 if (!ena_adminq_init(ena))
1170 return (false);
1171
1172 if (!ena_admin_sq_init(ena))
1173 return (false);
1174
1175 if (!ena_admin_cq_init(ena))
1176 return (false);
1177
1178 if (!ena_aenq_init(ena))
1179 return (false);
1180
1181 bzero(&resp, sizeof (resp));
1182 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
1183 ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);
1184
1185 if (ret != 0) {
1186 ena_err(ena, "failed to get device attributes: %d", ret);
1187 return (false);
1188 }
1189
1190 ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
1191 ena_dbg(ena, "device version: %u", feat->efda_device_version);
1192 ena_dbg(ena, "supported features: 0x%x",
1193 feat->efda_supported_features);
1194 ena_dbg(ena, "device capabilities: 0x%x", feat->efda_capabilities);
1195 ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
1196 ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
1197 maddr = feat->efda_mac_addr;
1198 ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
1199 maddr[2], maddr[3], maddr[4], maddr[5]);
1200 ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);
1201
1202 bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
1203 ena->ena_max_mtu = feat->efda_max_mtu;
1204 ena->ena_capabilities = feat->efda_capabilities;
1205 supported_features = feat->efda_supported_features;
1206 ena->ena_supported_features = supported_features;
1207 feat = NULL;
1208 bzero(&resp, sizeof (resp));
1209
1210 if (ena_is_feat_avail(ena, ENAHW_FEAT_MAX_QUEUES_EXT)) {
1211 enahw_feat_max_queue_ext_t *feat_mqe =
1212 &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;
1213
1214 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
1215 ENAHW_FEAT_MAX_QUEUES_EXT_VER);
1216
1217 if (ret != 0) {
1218 ena_err(ena, "failed to query max queues ext: %d", ret);
1219 return (false);
1220 }
1221
1222 ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
1223 ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
1224 ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
1225 ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
1226 ena->ena_tx_max_desc_per_pkt =
1227 feat_mqe->efmqe_max_per_packet_tx_descs;
1228 ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;
1229
1230 ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
1231 ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
1232 ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
1233 ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
1234 ena->ena_rx_max_desc_per_pkt =
1235 feat_mqe->efmqe_max_per_packet_rx_descs;
1236
1237 ena_set_max_io_queues(ena);
1238 } else {
1239 enahw_feat_max_queue_t *feat_mq =
1240 &resp.erd_resp.erd_get_feat.ergf_max_queue;
1241
1242 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
1243 ENAHW_FEAT_MAX_QUEUES_NUM_VER);
1244
1245 if (ret != 0) {
1246 ena_err(ena, "failed to query max queues: %d", ret);
1247 return (false);
1248 }
1249
1250 ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
1251 ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1252 ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
1253 ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1254 ena->ena_tx_max_desc_per_pkt =
1255 feat_mq->efmq_max_per_packet_tx_descs;
1256 ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;
1257
1258 ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
1259 ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
1260 ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
1261 ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
1262 ena->ena_rx_max_desc_per_pkt =
1263 feat_mq->efmq_max_per_packet_rx_descs;
1264
1265 ena_set_max_io_queues(ena);
1266 }
1267
1268 ena->ena_mtu = ena->ena_max_mtu;
1269 ena_update_buf_sizes(ena);
1270
1271 if (!ena_get_hints(ena))
1272 return (false);
1273
1274 ena->ena_tx_sgl_max_sz = 1;
1275 ena->ena_rx_sgl_max_sz = 1;
1276 if (ena->ena_device_hints.eh_max_tx_sgl != 0)
1277 ena->ena_tx_sgl_max_sz = ena->ena_device_hints.eh_max_tx_sgl;
1278 if (ena->ena_device_hints.eh_max_rx_sgl != 0)
1279 ena->ena_rx_sgl_max_sz = ena->ena_device_hints.eh_max_rx_sgl;
1280
1281 if (!ena_init_host_info(ena))
1282 return (false);
1283
1284 if (!ena_aenq_configure(ena))
1285 return (false);
1286
1287 ena_get_link_config(ena);
1288
1289 if (!ena_get_offloads(ena))
1290 return (false);
1291
1292 if (!ena_stat_device_init(ena))
1293 return (false);
1294
1295 if (!ena_stat_device_basic_init(ena))
1296 return (false);
1297
1298 if (!ena_stat_device_extended_init(ena))
1299 return (false);
1300
1301 if (!ena_stat_aenq_init(ena))
1302 return (false);
1303
1304 ena_update_regcache(ena);
1305
1306 return (true);
1307 }
1308
1309 static void
1310 ena_cleanup_intr_alloc(ena_t *ena, bool resetting)
1311 {
1312 VERIFY0(resetting);
1313
1314 for (int i = 0; i < ena->ena_num_intrs; i++) {
1315 int ret = ddi_intr_free(ena->ena_intr_handles[i]);
1316 if (ret != DDI_SUCCESS) {
1317 ena_err(ena, "failed to free interrupt %d: %d", i, ret);
1318 }
1319 }
1320
1321 if (ena->ena_intr_handles != NULL) {
1322 kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
1323 ena->ena_intr_handles = NULL;
1324 ena->ena_intr_handles_sz = 0;
1325 }
1326 }
1327
1328 /*
1329 * The Linux driver supports only MSI-X interrupts. We do the same,
1330 * with the assumption that it's the only type of interrupt the device
1331 * can present.
1332 */
1333 static bool
1334 ena_attach_intr_alloc(ena_t *ena)
1335 {
1336 int ret;
1337 int types;
1338 int min, req, ideal, avail, actual;
1339
1340 ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
1341 if (ret != DDI_SUCCESS) {
1342 ena_err(ena, "failed to get interrupt types: %d", ret);
1343 return (false);
1344 }
1345
1346 ena_dbg(ena, "supported interrupt types: 0x%x", types);
1347 if ((types & DDI_INTR_TYPE_MSIX) == 0) {
1348 ena_err(ena, "the ena driver only supports MSI-X interrupts");
1349 return (false);
1350 }
1351
1352 /* One for I/O, one for adminq. */
1353 min = 2;
1354 ideal = ena->ena_max_io_queues + 1;
1355 ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1356 if (ret != DDI_SUCCESS) {
1357 ena_err(ena, "failed to get number of MSI-X interrupts: %d",
1358 ret);
1359 return (false);
1360 }
1361
1362 if (avail < min) {
1363 ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
1364 "requires a minimum of %d", avail, min);
1365 return (false);
1366 }
1367
1368 ena_dbg(ena, "%d MSI-X interrupts available", avail);
1369
1370 ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
1371 if (ret != DDI_SUCCESS) {
1372 ena_err(ena, "failed to get available interrupts: %d", ret);
1373 return (false);
1374 }
1375
1376 if (avail < min) {
1377 ena_err(ena, "number of available MSI-X interrupts is %d, "
1378 "but the driver requires a minimum of %d", avail, min);
1379 return (false);
1380 }
1381
1382 req = MIN(ideal, avail);
1383 ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
1384 ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);
1385
1386 ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
1387 DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
1388 if (ret != DDI_SUCCESS) {
1389 ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
1390 req, ret);
1391 return (false);
1392 }
1393
1394 if (actual < min) {
1395 ena_err(ena, "number of allocated interrupts is %d, but the "
1396 "driver requires a minimum of %d", actual, min);
1397 return (false);
1398 }
1399
1400 ena->ena_num_intrs = actual;
1401
1402 ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
1403 if (ret != DDI_SUCCESS) {
1404 ena_err(ena, "failed to get interrupt capability: %d", ret);
1405 return (false);
1406 }
1407
1408 ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
1409 if (ret != DDI_SUCCESS) {
1410 ena_err(ena, "failed to get interrupt priority: %d", ret);
1411 return (false);
1412 }
1413
1414 ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
1415 actual, ena->ena_intr_caps, ena->ena_intr_pri);
1416
1417 /*
1418 * The ena_lock should not be held in the data path, but it is
1419 * held as part of the AENQ handler, which runs in interrupt
1420  * context. Therefore, we delay the initialization of this
1421 * mutex until after the interrupts are allocated.
1422 */
1423 mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
1424 DDI_INTR_PRI(ena->ena_intr_pri));
1425 mutex_init(&ena->ena_watchdog_lock, NULL, MUTEX_DRIVER, NULL);
1426
1427 return (true);
1428 }
1429
1430 /*
1431 * Allocate the parent Rx queue structures. More importantly, this is
1432 * NOT allocating the queue descriptors or data buffers. Those are
1433 * allocated on demand as queues are started.
1434 */
1435 static bool
1436 ena_attach_alloc_rxqs(ena_t *ena)
1437 {
1438 bool resetting = false;
1439
1440 if (ena->ena_rxqs == NULL) {
1441 /*
1442 * We rely on the interrupt priority for initializing the
1443 * mutexes.
1444 */
1445 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1446 ena->ena_num_rxqs = ena->ena_num_intrs - 1;
1447 ASSERT3U(ena->ena_num_rxqs, >, 0);
1448 ena->ena_rxqs = kmem_zalloc(
1449 ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), KM_SLEEP);
1450 } else {
1451 resetting = true;
1452 }
1453
1454 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1455 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1456
1457 rxq->er_rxqs_idx = i;
1458 /* The 0th vector is for Admin + AENQ. */
1459 rxq->er_intr_vector = i + 1;
1460 rxq->er_mrh = NULL;
1461
1462 if (!resetting) {
1463 mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
1464 DDI_INTR_PRI(ena->ena_intr_pri));
1465 mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
1466 DDI_INTR_PRI(ena->ena_intr_pri));
1467 }
1468
1469 rxq->er_ena = ena;
1470 rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
1471 rxq->er_cq_num_descs = ena->ena_rxq_num_descs;
1472
1473 if (!ena_stat_rxq_init(rxq)) {
1474 return (false);
1475 }
1476
1477 if (!ena_alloc_rxq(rxq)) {
1478 ena_stat_rxq_cleanup(rxq);
1479 return (false);
1480 }
1481 }
1482
1483 return (true);
1484 }
1485
1486 static void
1487 ena_cleanup_rxqs(ena_t *ena, bool resetting)
1488 {
1489 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1490 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1491
1492 ena_cleanup_rxq(rxq, resetting);
1493 if (!resetting) {
1494 mutex_destroy(&rxq->er_lock);
1495 mutex_destroy(&rxq->er_stat_lock);
1496 }
1497 ena_stat_rxq_cleanup(rxq);
1498 }
1499
1500 if (!resetting) {
1501 kmem_free(ena->ena_rxqs,
1502 ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
1503 ena->ena_rxqs = NULL;
1504 }
1505 }
1506
1507 /*
1508 * Allocate the parent Tx queue structures. More importantly, this is
1509 * NOT allocating the queue descriptors or data buffers. Those are
1510 * allocated on demand as a queue is started.
1511 */
1512 static bool
1513 ena_attach_alloc_txqs(ena_t *ena)
1514 {
1515 bool resetting = false;
1516
1517 if (ena->ena_txqs == NULL) {
1518 /*
1519 * We rely on the interrupt priority for initializing the
1520 * mutexes.
1521 */
1522 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
1523 ena->ena_num_txqs = ena->ena_num_intrs - 1;
1524 ASSERT3U(ena->ena_num_txqs, >, 0);
1525 ena->ena_txqs = kmem_zalloc(
1526 ena->ena_num_txqs * sizeof (*ena->ena_txqs), KM_SLEEP);
1527 } else {
1528 resetting = true;
1529 }
1530
1531 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1532 ena_txq_t *txq = &ena->ena_txqs[i];
1533
1534 txq->et_txqs_idx = i;
1535 /* The 0th vector is for Admin + AENQ. */
1536 txq->et_intr_vector = i + 1;
1537 txq->et_mrh = NULL;
1538
1539 if (!resetting) {
1540 mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
1541 DDI_INTR_PRI(ena->ena_intr_pri));
1542 mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
1543 DDI_INTR_PRI(ena->ena_intr_pri));
1544 }
1545
1546 txq->et_ena = ena;
1547 txq->et_sq_num_descs = ena->ena_txq_num_descs;
1548 txq->et_cq_num_descs = ena->ena_txq_num_descs;
1549
1550 if (!ena_stat_txq_init(txq)) {
1551 return (false);
1552 }
1553
1554 if (!ena_alloc_txq(txq)) {
1555 ena_stat_txq_cleanup(txq);
1556 return (false);
1557 }
1558 }
1559
1560 return (true);
1561 }
1562
1563 static void
1564 ena_cleanup_txqs(ena_t *ena, bool resetting)
1565 {
1566 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1567 ena_txq_t *txq = &ena->ena_txqs[i];
1568
1569 ena_cleanup_txq(txq, resetting);
1570 if (!resetting) {
1571 mutex_destroy(&txq->et_lock);
1572 mutex_destroy(&txq->et_stat_lock);
1573 }
1574 ena_stat_txq_cleanup(txq);
1575 }
1576
1577 if (!resetting) {
1578 kmem_free(ena->ena_txqs,
1579 ena->ena_num_txqs * sizeof (*ena->ena_txqs));
1580 ena->ena_txqs = NULL;
1581 }
1582 }
1583
1584 /*
1585 * To reset the device we need to unwind some of the steps taken during attach
1586 * but, since the device could well be in a failed state, we cannot rely on
1587 * being able to talk via the admin queue to do things such as explicitly
1588 * destroy rings. We call selected cleanup handlers with the second parameter
1589 * set to "true" to indicate that we are resetting and should avoid such
1590 * communication.
1591 *
1592 * The existing DMA memory regions for the admin queue, async event queue and
1593 * host information are preserved but have their contents zeroed.
1594 * Experimentation has shown that the device hangs onto old async event queue
1595 * addresses, even through a reset, with surprising results if the addresses
1596 * happen to change.
1597 *
1598 * We clean up all of the Tx and Rx ring descriptors and the TCBs but leave the
1599 * allocated memory for the ring data and mutexes intact. Pointers to this
1600 * memory have already been provided to MAC, and the mutexes keep the rings
1601 * locked until we're ready to start them again.
1602 *
1603 * To ensure that other driver activity is excluded, we hold the mutexes on the
1604 * Tx and Rx rings throughout, and unset the `ENA_STATE_STARTED` bit in the
1605 * state, which causes the interrupt handlers to return without doing any work.
1606 * The admin interrupt, used for notifications of admin completions or new
1607 * asynchronous events, is masked after the device is reset until we're ready
1608 * to process them again.
1609 */
1610 bool
1611 ena_reset(ena_t *ena, const enahw_reset_reason_t reason)
1612 {
1613 ena_txq_state_t tx_state[ena->ena_num_txqs];
1614 ena_rxq_state_t rx_state[ena->ena_num_rxqs];
1615 bool ret = false;
1616
1617 ena_err(ena, "resetting device with reason 0x%x [%s]",
1618 reason, enahw_reset_reason(reason));
1619
1620 VERIFY0(ena->ena_state & ENA_STATE_RESETTING);
1621 atomic_or_32(&ena->ena_state, ENA_STATE_RESETTING);
1622
1623 VERIFY(ena->ena_state & ENA_STATE_STARTED);
1624 atomic_and_32(&ena->ena_state, ~ENA_STATE_STARTED);
1625
1626 mutex_enter(&ena->ena_lock);
1627
1628 ena_update_regcache(ena);
1629
1630 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1631 ena_txq_t *txq = &ena->ena_txqs[i];
1632
1633 mutex_enter(&txq->et_lock);
1634 tx_state[i] = txq->et_state;
1635 if (txq->et_state & ENA_TXQ_STATE_RUNNING)
1636 ena_ring_tx_stop((mac_ring_driver_t)txq);
1637 }
1638
1639 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1640 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1641
1642 mutex_enter(&rxq->er_lock);
1643 rx_state[i] = rxq->er_state;
1644 if (rxq->er_state & ENA_RXQ_STATE_RUNNING)
1645 ena_ring_rx_stop((mac_ring_driver_t)rxq);
1646 }
1647
1648 if (!ena_device_reset(ena, reason)) {
1649 ena_err(ena, "reset: failed to reset device");
1650 goto out;
1651 }
1652
1653 /* This masks the admin/aenq interrupt */
1654 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
1655
1656 ena_cleanup_txqs(ena, true);
1657 ena_cleanup_rxqs(ena, true);
1658
1659 ena_release_all_cmd_ctx(ena);
1660
1661 if (!ena_admin_cq_init(ena) || !ena_admin_sq_init(ena)) {
1662 ena_err(ena, "reset: failed to program admin queues");
1663 goto out;
1664 }
1665
1666 if (!ena_init_host_info(ena)) {
1667 ena_err(ena, "reset: failed to set host info");
1668 goto out;
1669 }
1670
1671 if (!ena_aenq_init(ena) || !ena_aenq_configure(ena)) {
1672 ena_err(ena, "reset: failed to configure aenq");
1673 goto out;
1674 }
1675
1676 if (!ena_set_mtu(ena)) {
1677 ena_err(ena, "reset: failed to set MTU");
1678 goto out;
1679 }
1680
1681 if (!ena_attach_alloc_txqs(ena) || !ena_attach_alloc_rxqs(ena)) {
1682 ena_err(ena, "reset: failed to program IO queues");
1683 goto out;
1684 }
1685
1686 ena_aenq_enable(ena);
1687 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1688
1689 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
1690 ena_rxq_t *rxq = &ena->ena_rxqs[i];
1691
1692 mutex_exit(&rxq->er_lock);
1693 if (rx_state[i] & ENA_RXQ_STATE_RUNNING) {
1694 (void) ena_ring_rx_start((mac_ring_driver_t)rxq,
1695 rxq->er_m_gen_num);
1696 }
1697 }
1698
1699 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
1700 ena_txq_t *txq = &ena->ena_txqs[i];
1701
1702 mutex_exit(&txq->et_lock);
1703 if (tx_state[i] & ENA_TXQ_STATE_RUNNING) {
1704 (void) ena_ring_tx_start((mac_ring_driver_t)txq,
1705 txq->et_m_gen_num);
1706 }
1707 }
1708
1709 atomic_or_32(&ena->ena_state, ENA_STATE_STARTED);
1710 ret = true;
1711
1712 out:
1713 atomic_and_32(&ena->ena_state, ~ENA_STATE_RESETTING);
1714 mutex_exit(&ena->ena_lock);
1715
1716 ena_update_regcache(ena);
1717
1718 return (ret);
1719 }
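
/*
 * Illustrative sketch only -- not part of the driver. The comment above
 * ena_reset() notes that clearing ENA_STATE_STARTED causes the interrupt
 * handlers to return without doing any work. A hypothetical I/O interrupt
 * handler following that protocol would look roughly like the following;
 * the handler name and the argument wiring are assumptions for the
 * example, not the driver's actual handlers.
 *
 *	static uint_t
 *	ena_example_io_intr(caddr_t arg1, caddr_t arg2)
 *	{
 *		ena_t *ena = (ena_t *)(void *)arg1;
 *
 *		if ((ena->ena_state & ENA_STATE_STARTED) == 0)
 *			return (DDI_INTR_CLAIMED);
 *
 *		... normal Tx/Rx processing ...
 *
 *		return (DDI_INTR_CLAIMED);
 *	}
 */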

ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = {
        {
                .ead_seq = ENA_ATTACH_PCI,
                .ead_name = "PCI config",
                .ead_attach_fn = ena_attach_pci,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_pci,
        },

        {
                .ead_seq = ENA_ATTACH_REGS,
                .ead_name = "BAR mapping",
                .ead_attach_fn = ena_attach_regs_map,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_regs_map,
        },

        {
                .ead_seq = ENA_ATTACH_DEV_INIT,
                .ead_name = "device initialization",
                .ead_attach_fn = ena_attach_device_init,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_device_init,
        },

        {
                .ead_seq = ENA_ATTACH_READ_CONF,
                .ead_name = "ena.conf",
                .ead_attach_fn = ena_attach_read_conf,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = NULL,
        },

        {
                .ead_seq = ENA_ATTACH_DEV_CFG,
                .ead_name = "device config",
                .ead_attach_fn = ena_attach_dev_cfg,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = NULL,
        },

        {
                .ead_seq = ENA_ATTACH_INTR_ALLOC,
                .ead_name = "interrupt allocation",
                .ead_attach_fn = ena_attach_intr_alloc,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_intr_alloc,
        },

        {
                .ead_seq = ENA_ATTACH_INTR_HDLRS,
                .ead_name = "interrupt handlers",
                .ead_attach_fn = ena_intr_add_handlers,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_intr_remove_handlers,
        },

        {
                .ead_seq = ENA_ATTACH_TXQS_ALLOC,
                .ead_name = "Tx queues",
                .ead_attach_fn = ena_attach_alloc_txqs,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_txqs,
        },

        {
                .ead_seq = ENA_ATTACH_RXQS_ALLOC,
                .ead_name = "Rx queues",
                .ead_attach_fn = ena_attach_alloc_rxqs,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = ena_cleanup_rxqs,
        },

        /*
         * The chance of mac_unregister() failure poses a problem to
         * cleanup. We address interrupt disablement and mac
         * unregistration explicitly in the attach/detach routines.
         */
        {
                .ead_seq = ENA_ATTACH_MAC_REGISTER,
                .ead_name = "mac registration",
                .ead_attach_fn = ena_mac_register,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = NULL,
        },

        {
                .ead_seq = ENA_ATTACH_INTRS_ENABLE,
                .ead_name = "enable interrupts",
                .ead_attach_fn = ena_intrs_enable,
                .ead_attach_hard_fail = true,
                .ead_cleanup_fn = NULL,
        }
};
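
/*
 * Illustrative sketch only -- not part of the driver. Were a new attach
 * stage ever added, its table entry would take the same shape as those
 * above. The names used below (ENA_ATTACH_EXAMPLE, ena_attach_example,
 * ena_cleanup_example) are hypothetical. Two expectations fall out of the
 * code that consumes this table: ead_seq must correspond to the entry's
 * position, since ena_cleanup() below indexes the table with
 * ena_attach_seq - 1, and the cleanup function (if any) must tolerate a
 * partially completed attach function.
 *
 *	{
 *		.ead_seq = ENA_ATTACH_EXAMPLE,
 *		.ead_name = "example stage",
 *		.ead_attach_fn = ena_attach_example,
 *		.ead_attach_hard_fail = true,
 *		.ead_cleanup_fn = ena_cleanup_example,
 *	},
 */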

/*
 * This function undoes any work done by ena_attach(), either in
 * response to a failed attach or a planned detach. At the end of this
 * function ena_attach_seq should be zero, otherwise it means
 * something has not been freed/uninitialized.
 */
static void
ena_cleanup(ena_t *ena)
{
        if (ena == NULL || ena->ena_attach_seq == 0) {
                return;
        }

        /*
         * We VERIFY this because if the seq is greater than the number of
         * entries we would run off the end of the table and execute god
         * knows what.
         */
        VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);

        while (ena->ena_attach_seq > 0) {
                int idx = ena->ena_attach_seq - 1;
                ena_attach_desc_t *desc = &ena_attach_tbl[idx];

                ena_dbg(ena, "running cleanup sequence: %s (%d)",
                    desc->ead_name, idx);

                if (desc->ead_cleanup_fn != NULL)
                        desc->ead_cleanup_fn(ena, false);
                ena->ena_attach_seq--;
        }

        ASSERT3U(ena->ena_attach_seq, ==, 0);
        mutex_destroy(&ena->ena_lock);
        mutex_destroy(&ena->ena_watchdog_lock);
}

static int
ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        ena_t *ena;

        if (cmd != DDI_ATTACH) {
                return (DDI_FAILURE);
        }

        ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
        ena->ena_instance = ddi_get_instance(dip);
        ena->ena_dip = dip;
        ena->ena_page_sz = ddi_ptob(dip, 1);

        for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
                bool success;
                ena_attach_desc_t *desc = &ena_attach_tbl[i];

                ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
                    i);

                if (!(success = desc->ead_attach_fn(ena))) {
                        ena_err(ena, "attach sequence failed: %s (%d)",
                            desc->ead_name, i);

                        if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
                                /*
                                 * In this specific case
                                 * ENA_ATTACH_INTRS_ENABLE has failed,
                                 * and we may or may not be able to
                                 * unregister the mac, depending on
                                 * whether something in userspace has
                                 * created a client on top.
                                 *
                                 * NOTE: Something that would be nice
                                 * to add to mac is the ability to
                                 * register a provider separate from
                                 * "publishing" it to the rest of the
                                 * system. This would allow a driver
                                 * to register its mac, do some
                                 * additional work that might fail,
                                 * and then unregister with no chance
                                 * of failure if that work fails. This
                                 * would remove the complexity of the
                                 * situation we are trying to address
                                 * here, as we would know that until
                                 * the mac has been "published", there
                                 * is no chance for mac_unregister()
                                 * to fail.
                                 */
                                if (ena_mac_unregister(ena) != 0) {
                                        return (DDI_FAILURE);
                                }

                                ena->ena_attach_seq--;
                        } else {
                                /*
                                 * Since ena_attach_seq is only
                                 * advanced after a successful
                                 * ead_attach_fn, the global cleanup
                                 * routine will not cover this step;
                                 * we must run this step's cleanup
                                 * handler ourselves before calling
                                 * it. This also means that all
                                 * cleanup functions must be able to
                                 * deal with partial success of the
                                 * corresponding ead_attach_fn.
                                 */
                                if (desc->ead_cleanup_fn != NULL)
                                        desc->ead_cleanup_fn(ena, false);
                        }

                        ena_cleanup(ena);
                        kmem_free(ena, sizeof (ena_t));
                        return (DDI_FAILURE);
                }

                if (success) {
                        ena_dbg(ena, "attach sequence completed: %s (%d)",
                            desc->ead_name, i);
                }

                ena->ena_attach_seq = desc->ead_seq;
        }

        /*
         * Now that interrupts are enabled, unmask the admin interrupt.
         * Note that this interrupt is generated for both the admin queue and
         * the AENQ, but this driver always polls the admin queue. The surplus
         * interrupt for admin command completion triggers a harmless check of
         * the AENQ.
         */
        ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
        ena_aenq_enable(ena);

        ddi_set_driver_private(dip, ena);

        ena_update_regcache(ena);

        atomic_or_32(&ena->ena_state, ENA_STATE_INITIALIZED);

        return (DDI_SUCCESS);
}
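
/*
 * Illustrative sketch only -- not part of the driver. As the comment in
 * ena_attach() notes, every cleanup function must cope with partial
 * success of its corresponding attach function. The usual pattern is to
 * check each resource before releasing it, roughly as below; the stage,
 * the fields and example_free() used here are all hypothetical.
 *
 *	static void
 *	ena_cleanup_example(ena_t *ena, bool resetting)
 *	{
 *		if (ena->ena_example_b != NULL) {
 *			example_free(ena->ena_example_b);
 *			ena->ena_example_b = NULL;
 *		}
 *
 *		if (ena->ena_example_a != NULL) {
 *			example_free(ena->ena_example_a);
 *			ena->ena_example_a = NULL;
 *		}
 *	}
 */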

static int
ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        ena_t *ena = ddi_get_driver_private(dip);

        if (ena == NULL) {
                return (DDI_FAILURE);
        }

        /*
         * Before we can proceed to cleanup we have to handle
         * mac_unregister() explicitly -- if there are still
         * outstanding clients, then we can't proceed with detach or
         * cleanup.
         */

        /*
         * Why this would fail I don't know, but if we proceed to mac
         * unregister, then there is a good chance we will panic in
         * the Rx interrupt handler when calling mac_rx_ring().
         */
        if (!ena_intrs_disable(ena)) {
                return (DDI_FAILURE);
        }

        /* We can't detach if clients are actively using the device. */
        if (ena_mac_unregister(ena) != 0) {
                (void) ena_intrs_enable(ena);
                return (DDI_FAILURE);
        }

        /*
         * At this point we can proceed with the rest of cleanup on a
         * best-effort basis.
         */
        ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
        ena_cleanup(ena);
        ddi_set_driver_private(dip, NULL);
        kmem_free(ena, sizeof (ena_t));
        return (DDI_SUCCESS);
}

static struct cb_ops ena_cb_ops = {
        .cb_open = nodev,
        .cb_close = nodev,
        .cb_strategy = nodev,
        .cb_print = nodev,
        .cb_dump = nodev,
        .cb_read = nodev,
        .cb_write = nodev,
        .cb_ioctl = nodev,
        .cb_devmap = nodev,
        .cb_mmap = nodev,
        .cb_segmap = nodev,
        .cb_chpoll = nochpoll,
        .cb_prop_op = ddi_prop_op,
        .cb_flag = D_MP,
        .cb_rev = CB_REV,
        .cb_aread = nodev,
        .cb_awrite = nodev
};

static struct dev_ops ena_dev_ops = {
        .devo_rev = DEVO_REV,
        .devo_refcnt = 0,
        .devo_getinfo = NULL,
        .devo_identify = nulldev,
        .devo_probe = nulldev,
        .devo_attach = ena_attach,
        .devo_detach = ena_detach,
        .devo_reset = nodev,
        .devo_quiesce = ddi_quiesce_not_supported,
        .devo_cb_ops = &ena_cb_ops
};

static struct modldrv ena_modldrv = {
        .drv_modops = &mod_driverops,
        .drv_linkinfo = "AWS ENA Ethernet",
        .drv_dev_ops = &ena_dev_ops
};

static struct modlinkage ena_modlinkage = {
        .ml_rev = MODREV_1,
        .ml_linkage = { &ena_modldrv, NULL }
};

int
_init(void)
{
        int ret;

        mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);

        if ((ret = mod_install(&ena_modlinkage)) != 0) {
                mac_fini_ops(&ena_dev_ops);
                return (ret);
        }

        return (ret);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&ena_modlinkage, modinfop));
}

int
_fini(void)
{
        int ret;

        if ((ret = mod_remove(&ena_modlinkage)) != 0) {
                return (ret);
        }

        mac_fini_ops(&ena_dev_ops);
        return (ret);
}