/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

#include "ena_hw.h"
#include "ena.h"

/*
 * Elastic Network Adapter (ENA) Driver
 * ------------------------------------
 *
 * The ena driver provides support for the AWS ENA device, also
 * referred to as their "enhanced networking". This device is present
 * on "Nitro"-based instances. It presents itself with the following
 * PCI Vendor/Device IDs:
 *
 *    o 1d0f:0ec2 -- ENA PF
 *    o 1d0f:1ec2 -- ENA PF (Reserved)
 *    o 1d0f:ec20 -- ENA VF
 *    o 1d0f:ec21 -- ENA VF (Reserved)
 *
 * This driver provides support for only the essential features needed
 * to drive traffic on an ENA device. Support for the following
 * features IS NOT currently implemented.
 *
 *    o Admin Queue Interrupts: queue completion events are always polled
 *    o FMA
 *    o Rx checksum offloads
 *    o Tx checksum offloads
 *    o Tx DMA bind (borrow buffers)
 *    o Rx DMA bind (loaned buffers)
 *    o TSO
 *    o RSS
 *    o Low Latency Queues (LLQ)
 *    o Support for different Tx completion policies
 *    o More controlled Tx recycling and Rx refill
 *
 * Even without these features the ena driver should perform
 * reasonably well.
 *
 * Driver vs. Hardware Types
 * -------------------------
 *
 * To properly communicate with the ENA device the driver must
 * populate memory (registers and buffers) with specific types. These
 * types are defined by the device and are found under the "common"
 * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
 * simplified this a bit by defining all device-specific types in the
 * ena_hw.h file. Furthermore, all device-specific types are given an
 * "enahw" prefix. This makes it clear when we are dealing with a
 * device type and when we are dealing with a driver type.
 *
 * [1]: https://github.com/amzn/amzn-drivers
 *
 * Groups, Rings (Queues), and Interrupts
 * --------------------------------------
 *
 * The ENA device presents one mac group. This single mac group
 * represents the single unicast address that this device represents
 * in your AWS instance. The ENA device presents no option for
 * configuring additional MAC addresses, multicast, or promisc mode --
 * you receive only what AWS wants you to receive.
 *
 * This single mac group may have one or more rings. The ENA driver
 * refers to rings as queues, for no special reason other than it was
 * the dominant language in the Linux and FreeBSD drivers, and it
 * spilled over into this port. The upper bound on the number of queues
 * is presented by the device. However, we don't just go with whatever
 * number of queues the device reports; rather, we limit the queues
 * based on other factors such as an absolute maximum, the number of
 * online CPUs, and the number of available interrupts. The upper bound
 * is calculated by ena_set_max_io_queues(), and that is used and
 * possibly further restricted in ena_attach_intr_alloc(). At this
 * point, ultimately, it is the number of available interrupts (minus
 * one for the admin queue) that determines the number of queues: one
 * Tx and one Rx on each I/O interrupt.
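 *
 * For example (the numbers here are purely illustrative, not taken
 * from any particular instance type): if the device reports 32 Tx and
 * 32 Rx queues, the instance has 16 online CPUs, and 9 MSI-X vectors
 * are available, then ena_set_max_io_queues() first caps the count at
 * 16 (CPUs) and ena_attach_intr_alloc() further caps it at 8 (9
 * vectors minus the one reserved for admin/AENQ), yielding 8 Tx and
 * 8 Rx queues.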
 *
 * NOTE: Perhaps it is overly restrictive to limit the number of
 * queues to the number of I/O interrupts. Something worth considering
 * on larger instances if they present far fewer interrupts than they
 * do queues + CPUs.
 *
 * The ENA device presents MSI-X interrupts only. During attach the
 * driver queries the number of available interrupts and sets aside
 * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
 * This means that a Tx/Rx queue at index 0 will map to vector 1, and
 * so on.
 *
 * NOTE: The ENA driver currently doesn't make full use of the Admin
 * Queue interrupt. This interrupt is used both to notify the driver
 * when a command response is ready, and when an async event is posted.
 * The ENA driver always polls the Admin Queue for responses.
 *
 * Tx Queue Workings
 * -----------------
 *
 * A single Tx queue (ena_txq_t) is made up of one submission queue
 * (SQ) and its paired completion queue (CQ). These two queues form a
 * logical descriptor ring which is used to send packets out of the
 * device -- where each SQ entry describes the packet to be sent
 * (enahw_tx_desc_t) and each CQ entry describes the result of sending
 * a packet (enahw_tx_cdesc_t). For this to work the host and device
 * must agree on which descriptors are currently owned by the host
 * (free for sending) and which are owned by the device (pending
 * device completion). This state is tracked on the host side via head
 * and tail indexes along with a phase value.
 *
 * The head and tail values represent the head and tail of the FIFO
 * queue of pending packets -- the next packet to be sent by the
 * device is head, and all descriptors up to tail are ready for
 * sending. The phase allows the host to determine which CQ
 * descriptors represent completed events when using per-SQ completion
 * events (as opposed to queue head pointer updates). As the queues
 * represent a logical ring buffer, the phase must alternate on
 * wrap-around. The device initializes the phase to zero, and the host
 * starts with a phase of 1. The first set of packet descriptor
 * writes, and their corresponding completions, are indicated with a
 * phase of 1.
 *
 * For example, the diagram below represents the SQ/CQ state after the
 * first 6 packets have been sent by the host and 2 of them have been
 * completed by the device (and these completions have been processed
 * by the driver). In this state the host could send 4 more packets
 * before needing to wait on completion events.
 *
 *    +---+---+---+---+---+---+---+---+
 * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |  phase = 1
 *    +---+---+---+---+---+---+---+---+
 *                              ^
 *                              |
 *                            tail
 *            head
 *             |
 *             v
 *    +---+---+---+---+---+---+---+---+
 * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |  phase = 1
 *    +---+---+---+---+---+---+---+---+
 *
 * The next diagram shows how the state changes as 5 more packets are
 * sent (for a total of 11) and 7 more are completed (for a total of
 * 9). Notice that as the SQ and CQ have wrapped around their phases
 * have been complemented. In this state the host could send 6 more
 * packets before needing to wait on completion events.
 *
 *    +---+---+---+---+---+---+---+---+
 * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 |  phase = 0
 *    +---+---+---+---+---+---+---+---+
 *                  ^
 *                  |
 *                tail
 *        head
 *         |
 *         v
 *    +---+---+---+---+---+---+---+---+
 * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |  phase = 0
 *    +---+---+---+---+---+---+---+---+
 *
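 * In code, completion processing follows the phase bit. The sketch
 * below is illustrative only -- the variable names do not necessarily
 * correspond to the driver's actual structures:
 *
 *	while (cdesc[head % num_descs].phase == phase) {
 *		<recycle the TCB referenced by this completion>
 *		head++;
 *		if ((head % num_descs) == 0)
 *			phase ^= 1;	<complement phase on wrap-around>
 *	}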
 *
 * Currently, all packets are copied for Tx. At ring start we allocate
 * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a
 * DMA buffer associated with it; and each buffer is large enough to
 * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
 * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
 * the TCB's DMA buffer, and a new descriptor is written to the SQ
 * describing said TCB buffer. If and when we add more advanced
 * features like DMA binding of mblks and TSO, this 1:1 guarantee will
 * no longer hold.
 *
 * Rx Queue Workings
 * -----------------
 *
 * In terms of implementing the logical descriptor ring, the Rx queues
 * are very much like the Tx queues. There is a paired SQ and CQ for
 * each logical ring. The difference is that in Rx the SQ is for
 * handing buffers to the device to fill, and the CQ is for describing
 * the contents of those buffers for a given received frame. At Rx
 * ring start we allocate an Rx Control Buffer (RCB) for each
 * descriptor in the ring. Each RCB has a DMA buffer associated with
 * it; and each buffer is large enough to hold the MTU. For each
 * received frame we copy the contents out of the RCB and into its own
 * mblk, immediately returning the RCB for reuse. As with Tx, this
 * gives us a simple 1:1 mapping currently, but if more advanced
 * features are implemented later this could change.
 *
 * Asynchronous Event Notification Queue (AENQ)
 * --------------------------------------------
 *
 * Each ENA device comes with a mechanism for sending out-of-band
 * notifications to the driver. This includes events like link state
 * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
 * delivery mechanism is via interrupt, handled by the ena_aenq_work()
 * function, which dispatches via the eaenq_hdlrs table. If no handler
 * is registered, the ena_aenq_default_hdlr() handler is used. A given
 * device may not support all the different event types
 * (enahw_aenq_groups_t); and the driver may choose to enable a subset
 * of the supported events. During attach we call ena_aenq_configure()
 * to negotiate the supported/enabled events. The enabled group is
 * stored in ena_aenq_enabled_groups.
 *
 * Queues and Unsigned Wraparound
 * ------------------------------
 *
 * All the queues use a uint16_t value as their head/tail values, e.g.
 * the Rx queue's er_cq_head_idx value. You might notice that we only
 * ever increment these values, letting them perform implicit unsigned
 * integer wraparound. This is intended. This is the same behavior as
 * the common code, and seems to be what the hardware expects. Of
 * course, when accessing our own descriptor arrays we must make sure
 * to first perform a modulo of this value or risk running off into
 * space.
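 *
 * For example, with a ring of 128 descriptors, a head value of 65535
 * refers to descriptor 65535 % 128 = 127; the uint16_t increment then
 * wraps to 0, which refers to descriptor 0. A sketch (the names here
 * other than er_cq_head_idx are illustrative):
 *
 *	desc = &ring[rxq->er_cq_head_idx % num_descs];
 *	rxq->er_cq_head_idx++;		<implicit wrap at 65536>
 *
 * Note that the interaction between wraparound and modulo is only
 * this clean when the descriptor count evenly divides the uint16_t
 * range (i.e. is a power of two).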
 *
 * Watchdog and Device Reset
 * -------------------------
 *
 * While the device is running, the driver periodically invokes a
 * watchdog function to check that all is well, and to reset the
 * device if not. The device will be reset if any of the following is
 * true:
 *
 *    o The device's status register fatal error bit is set. A device
 *      in this state will no longer process any queues;
 *    o No asynchronous event keepalives have been received for some
 *      time -- see ENA_DEVICE_KEEPALIVE_TIMEOUT_NS;
 *    o A Tx queue has remained blocked for some time -- see
 *      ENA_TX_STALL_TIMEOUT;
 *    o The device has requested, via an asynchronous event, that we
 *      perform a reset;
 *    o Driver code has detected an error and set the ENA_STATE_ERROR
 *      bit in ena_state.
 *
 * There is a "fatal error" asynchronous event, but common code does
 * not use that as a reason to trigger a reset, and so neither do we.
 *
 * The global `ena_force_reset` variable can be used as a simple means
 * to trigger a reset during driver development and testing. If there
 * are multiple instances, it is likely that only one of them will
 * reset when this variable is changed to `true`.
 *
 * Attach Sequencing
 * -----------------
 *
 * Most drivers implement their attach/detach/cleanup functions as a
 * sequential stream of function calls used to allocate and initialize
 * resources in an order determined by the device's programming manual
 * combined with any requirements imposed by the kernel and its
 * relevant modules. These functions can become quite long. It is
 * often hard to see the order in which steps are taken, and even
 * harder to tell if detach/cleanup undoes them in the correct order,
 * or even if it undoes them at all! The only sure way to understand
 * the flow is to take good notes while closely inspecting each line
 * of code. Even then, it's easy for attach and detach to get out of
 * sync.
 *
 * Some more recent drivers have improved on this situation by using a
 * bit vector to track the sequence of events in attach/detach. Each
 * bit is declared as an enum value, in the same order that attach is
 * expected to run, and thus detach would run in the exact opposite
 * order. This has three main benefits:
 *
 *    1. It makes it easier to determine sequence order at a
 *       glance.
 *
 *    2. It gives a better idea of what state the device is in during
 *       debugging (the sequence bit vector is kept with the instance
 *       state).
 *
 *    3. The detach function can verify that all sequence bits are
 *       cleared, indicating that everything done in attach was
 *       successfully undone.
 *
 * These are great improvements. However, the attach/detach functions
 * can still become unruly, and there is still no guarantee that
 * detach is done in the opposite order of attach (this is not always
 * strictly required, but is probably the best way to write detach).
 * There is still a lot of boilerplate and chance for programmer
 * error.
 *
 * The ena driver takes the sequence idea a bit further, creating a
 * descriptor table of the attach sequence (ena_attach_tbl). This
 * table is used by attach/detach to generically, declaratively, and
 * programmatically enforce the precise sequence order and verify that
 * anything that is done is undone. This provides several benefits:
 *
 *    o Correct order is enforced implicitly by the descriptor table.
 *      It is impossible for the detach sequence to run in any order
 *      other than the opposite of attach.
 *
 *    o It is obvious what the precise attach sequence is. While the
 *      bit vector enum helps a lot with this, it doesn't prevent
 *      programmer error. With the sequence defined as a declarative
 *      table, it is easy for the programmer to see the order and
 *      know it is followed exactly.
 *
 *    o It is impossible to modify the attach sequence without also
 *      specifying a callback for its dual in the detach sequence.
 *
 *    o Common and repetitive code like error checking, logging, and
 *      bit vector modification is eliminated and centralized, again
 *      reducing the chance of programmer error.
 *
 * The ena attach sequence is defined under ena_attach_seq_t. The
 * descriptor table is defined under ena_attach_tbl.
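 *
 * As a rough sketch of the idea (the authoritative definitions appear
 * later in this file), each ena_attach_tbl entry pairs an attach
 * callback with its cleanup dual:
 *
 *	{
 *		.ead_seq = ENA_ATTACH_PCI,
 *		.ead_name = "PCI config",
 *		.ead_attach_fn = ena_attach_pci,
 *		.ead_cleanup_fn = ena_cleanup_pci,
 *	}
 *
 * ena_attach() walks the table forward running each ead_attach_fn,
 * and ena_cleanup() walks it backward running each ead_cleanup_fn.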
 */

/*
 * These are some basic data layout invariants on which development
 * assumptions were made.
 */
CTASSERT(sizeof (enahw_tx_data_desc_t) == 16);
CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));

/*
 * Amazon does not specify the endianness of the ENA device. We assume
 * it's the same as the bus, and we assume the CPU/bus is always
 * little endian.
 */
#ifdef _BIG_ENDIAN
#error "ENA driver is little-endian only"
#endif

/*
 * These values are used to communicate the driver version to the AWS
 * hypervisor via the ena_set_host_info() function. We don't know what
 * exactly AWS does with this info, but it's fairly safe to assume
 * it's used solely for debug/informational purposes. The Linux driver
 * updates these values frequently as bugs are fixed and features are
 * added.
 */
#define	ENA_DRV_VER_MAJOR	1
#define	ENA_DRV_VER_MINOR	0
#define	ENA_DRV_VER_SUBMINOR	0

uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;

/*
 * Log an error message. We leave the destination (console or system
 * log) up to the caller.
 */
void
ena_err(const ena_t *ena, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (ena != NULL && ena->ena_dip != NULL) {
		vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
	} else {
		vcmn_err(CE_WARN, fmt, ap);
	}
	va_end(ap);
}

void
ena_panic(const ena_t *ena, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (ena != NULL && ena->ena_dip != NULL) {
		vdev_err(ena->ena_dip, CE_PANIC, fmt, ap);
	} else {
		vcmn_err(CE_PANIC, fmt, ap);
	}
	va_end(ap);
}

/*
 * Set this to true to enable debug messages.
 */
bool ena_debug = false;

/*
 * Log a debug message. We force all debug messages to go to the
 * system log.
 */
void
ena_dbg(const ena_t *ena, const char *fmt, ...)
{
	va_list ap;

	if (ena_debug) {
		char msg[1024];

		va_start(ap, fmt);
		(void) vsnprintf(msg, sizeof (msg), fmt, ap);
		va_end(ap);

		if (ena != NULL && ena->ena_dip != NULL) {
			dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
		} else {
			cmn_err(CE_NOTE, "!%s", msg);
		}
	}
}

void
ena_trigger_reset(ena_t *ena, enahw_reset_reason_t reason)
{
	mutex_enter(&ena->ena_lock);
	ena->ena_reset_reason = reason;
	mutex_exit(&ena->ena_lock);
	atomic_or_32(&ena->ena_state, ENA_STATE_ERROR);
}
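
/*
 * For example, the keepalive check in the watchdog might use
 * ena_trigger_reset() roughly as follows. This is only a sketch: the
 * last-keepalive bookkeeping shown here is illustrative and the
 * specific reset reason is elided:
 *
 *	if (gethrtime() - last_keepalive > ENA_DEVICE_KEEPALIVE_TIMEOUT_NS)
 *		ena_trigger_reset(ena, <keepalive timeout reason>);
 *
 * Setting ENA_STATE_ERROR is what prompts the watchdog to perform the
 * actual device reset via ena_reset().
 */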

/*
 * Determine if a given feature is available on this device.
 */
bool
ena_is_feat_avail(ena_t *ena, const enahw_feature_id_t feat_id)
{
	VERIFY3U(feat_id, <=, ENAHW_FEAT_NUM);
	uint32_t mask = 1U << feat_id;

	/*
	 * The device attributes feature is always supported, as
	 * indicated by the common code.
	 */
	if (feat_id == ENAHW_FEAT_DEVICE_ATTRIBUTES)
		return (true);

	return ((ena->ena_supported_features & mask) != 0);
}

/*
 * Determine if a given capability is available on this device.
 */
bool
ena_is_cap_avail(ena_t *ena, const enahw_capability_id_t cap_id)
{
	VERIFY3U(cap_id, <=, ENAHW_CAP_NUM);
	uint32_t mask = 1U << cap_id;

	return ((ena->ena_capabilities & mask) != 0);
}

static bool
ena_device_reset(ena_t *ena, enum enahw_reset_reason_types reason)
{
	uint32_t rval, wval, reason_lsb, reason_msb;
	hrtime_t timeout, expired;

	rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
	if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
		ena_err(ena, "reset: device is not ready");
		return (false);
	}

	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);

	/*
	 * The device stores the reset timeout at 100ms resolution; we
	 * normalize that to nanoseconds.
	 */
	timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);

	if (timeout == 0) {
		ena_err(ena, "device gave invalid (0) reset timeout");
		return (false);
	}

	expired = gethrtime() + timeout;

	wval = ENAHW_DEV_CTL_DEV_RESET_MASK;

	reason_lsb = ENAHW_RESET_REASON_LSB(reason);
	reason_msb = ENAHW_RESET_REASON_MSB(reason);

	wval |= (reason_lsb << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
	    ENAHW_DEV_CTL_RESET_REASON_MASK;
	if (ena_is_cap_avail(ena, ENAHW_CAP_EXTENDED_RESET_REASONS)) {
		wval |= (reason_msb << ENAHW_DEV_CTL_RESET_REASON_EXT_SHIFT) &
		    ENAHW_DEV_CTL_RESET_REASON_EXT_MASK;
	} else if (reason_msb != 0) {
		/* Fall back to "generic" which we know will fit */
		wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
		wval |= (ENAHW_RESET_GENERIC <<
		    ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
		    ENAHW_DEV_CTL_RESET_REASON_MASK;
	}

	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);

	/*
	 * Make sure reset is in progress.
	 */
	for (;;) {
		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);

		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0)
			break;

		if (gethrtime() > expired) {
			ena_err(ena, "device reset start timed out");
			return (false);
		}

		/* Sleep for 100 milliseconds. */
		delay(drv_usectohz(100 * 1000));
	}

	/*
	 * Reset the timeout counter for the next device request.
	 */
	expired = gethrtime() + timeout;

	/*
	 * Wait for the device reset to finish.
	 */
	ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
	for (;;) {
		rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);

		if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
			break;
		}

		if (gethrtime() > expired) {
			ena_err(ena, "device reset timed out");
			return (false);
		}

		/* Sleep for 100 milliseconds. */
		delay(drv_usectohz(100 * 1000));
	}

	ena_dbg(ena, "device reset succeeded");

	return (true);
}

static bool
ena_attach_pci(ena_t *ena)
{
	ddi_acc_handle_t hdl;

	if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
		return (false);
	}

	ena->ena_pci_hdl = hdl;
	ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
	ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
	ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
	ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
	ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
	ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
	    ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
	    ena->ena_pci_svid, ena->ena_pci_sdid);

	return (true);
}

static void
ena_cleanup_pci(ena_t *ena, bool resetting)
{
	VERIFY0(resetting);
	pci_config_teardown(&ena->ena_pci_hdl);
}

static void
ena_cleanup_regs_map(ena_t *ena, bool resetting)
{
	VERIFY0(resetting);
	ddi_regs_map_free(&ena->ena_reg_hdl);
}

static bool
ena_attach_regs_map(ena_t *ena)
{
	int ret = 0;

	if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
	    DDI_SUCCESS) {
		ena_err(ena, "failed to get register set %d size",
		    ENA_REG_NUMBER);
		return (false);
	}

	ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
	bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
	ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
	ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
	ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;

	/*
	 * This function can return several different failure values,
	 * so we make sure to capture its return value for the purpose
	 * of logging.
	 */
	ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
	    &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
	    &ena->ena_reg_hdl);

	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to map register set %d: %d",
		    ENA_REG_NUMBER, ret);
		return (false);
	}

	ena_dbg(ena, "registers mapped to base: 0x%p",
	    (void *)ena->ena_reg_base);

	return (true);
}
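
/*
 * Once the BAR is mapped, device registers are read and written via
 * the ena_hw_bar_read32()/ena_hw_bar_write32() helpers, e.g. (a purely
 * illustrative usage):
 *
 *	uint32_t caps = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
 */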

/*
 * Free any resources related to the admin submission queue.
 */
static void
ena_admin_sq_free(ena_t *ena)
{
	ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
}

/*
 * Initialize the admin submission queue.
 */
static bool
ena_admin_sq_init(ena_t *ena)
{
	ena_adminq_t *aq = &ena->ena_aq;
	ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
	size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
	uint32_t addr_low, addr_high, wval;

	if (aq->ea_sq.eas_entries == NULL) {
		ena_dma_conf_t conf = {
			.edc_size = size,
			.edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
			.edc_sgl = 1,
			.edc_endian = DDI_NEVERSWAP_ACC,
			.edc_stream = false,
		};

		if (!ena_dma_alloc(ena, dma, &conf, size)) {
			ena_err(ena, "failed to allocate DMA for Admin SQ");
			return (false);
		}

		ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
		aq->ea_sq.eas_entries = (void *)dma->edb_va;
	} else {
		ena_dma_bzero(dma);
	}

	aq->ea_sq.eas_tail = 0;
	aq->ea_sq.eas_phase = 1;
	aq->ea_sq.eas_dbaddr =
	    (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
	wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
	    ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
	ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);

	return (true);
}

/*
 * Free any resources related to the admin completion queue.
 */
static void
ena_admin_cq_free(ena_t *ena)
{
	ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
}

/*
 * Initialize the admin completion queue.
 */
static bool
ena_admin_cq_init(ena_t *ena)
{
	ena_adminq_t *aq = &ena->ena_aq;
	ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
	uint32_t addr_low, addr_high, wval;

	if (aq->ea_cq.eac_entries == NULL) {
		size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
		ena_dma_conf_t conf = {
			.edc_size = size,
			.edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
			.edc_sgl = 1,
			.edc_endian = DDI_NEVERSWAP_ACC,
			.edc_stream = false,
		};

		if (!ena_dma_alloc(ena, dma, &conf, size)) {
			ena_err(ena, "failed to allocate DMA for Admin CQ");
			return (false);
		}

		ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
		aq->ea_cq.eac_entries = (void *)dma->edb_va;
	} else {
		ena_dma_bzero(dma);
	}

	aq->ea_cq.eac_head = 0;
	aq->ea_cq.eac_phase = 1;
	addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
	addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
	wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
	    ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
	ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);

	return (true);
}
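
/*
 * With both admin queues programmed, issuing a command in polling mode
 * conceptually works as follows (a sketch only; the real submission
 * and completion logic lives in the admin queue code, not here):
 *
 *    o Fill in the descriptor at eas_entries[eas_tail % ea_qlen],
 *      tagging it with the current SQ phase.
 *    o Increment eas_tail and write the new value to the doorbell at
 *      eas_dbaddr.
 *    o Poll eac_entries at eac_head, comparing each completion's phase
 *      bit against eac_phase, until the response arrives or
 *      ea_cmd_timeout_ns elapses.
 */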

void
ena_update_hints(ena_t *ena, enahw_device_hints_t *hints)
{
	ena->ena_device_hints.eh_mmio_read_timeout =
	    hints->edh_mmio_read_timeout;
	ena->ena_device_hints.eh_keep_alive_timeout =
	    hints->edh_keep_alive_timeout;
	ena->ena_device_hints.eh_tx_comp_timeout = hints->edh_tx_comp_timeout;
	ena->ena_device_hints.eh_missed_tx_reset_threshold =
	    hints->edh_missed_tx_reset_threshold;
	ena->ena_device_hints.eh_admin_comp_timeout =
	    hints->edh_admin_comp_timeout;
	ena->ena_device_hints.eh_max_tx_sgl = hints->edh_max_tx_sgl;
	ena->ena_device_hints.eh_max_rx_sgl = hints->edh_max_rx_sgl;
}

/*
 * We limit the max number of I/O queues based on several aspects of
 * the underlying hardware.
 *
 * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
 *    which comes from the common code and presumably is based on
 *    device constraints.
 *
 * 2. Next we latch the number of I/O queues to the number of online
 *    CPUs. The idea being that each queue is a parallel work stream,
 *    and having more queues than CPUs to flush them will not improve
 *    performance. The number of online CPUs can change dynamically,
 *    and that's okay, everything should still work fine, it just
 *    might not be ideal.
 *
 * 3. Next we latch the number of I/O queues to the smallest of the
 *    max Tx queues and max Rx queues. We could probably loosen this
 *    restriction in the future, and have separate max I/O queues for
 *    Tx and Rx. This is what Linux does, and seems like a fine place
 *    to start.
 */
static void
ena_set_max_io_queues(ena_t *ena)
{
	uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;

	max = MIN(ncpus_online, max);
	/*
	 * Supposedly a device could present a different number of SQs
	 * and CQs. This driver is designed in a way that requires
	 * each SQ to have a corresponding and dedicated CQ (how would
	 * it work otherwise?). Therefore, we must check both values
	 * and find the minimum between them.
	 */
	max = MIN(ena->ena_tx_max_sq_num, max);
	max = MIN(ena->ena_tx_max_cq_num, max);
	max = MIN(ena->ena_rx_max_sq_num, max);
	max = MIN(ena->ena_rx_max_cq_num, max);

	/* This shouldn't happen, but just in case. */
	if (max == 0) {
		max = 1;
	}

	ena->ena_max_io_queues = max;
}

/*
 * We require that an Rx or Tx buffer be able to hold the maximum MTU
 * along with the maximum frame header length. In this case we know
 * ENA is presenting us an Ethernet frame, so we add the size of an
 * Ethernet VLAN header. Rx buffers additionally require some margin
 * for the sake of IP header alignment.
 */
static void
ena_update_buf_sizes(ena_t *ena)
{
	ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
	ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
	ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
	    ena->ena_page_sz, uint32_t);
	ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
	    ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
}
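
/*
 * As a worked example (with illustrative numbers): for a 9001 byte
 * MTU, an 18 byte ether_vlan_header and 4 KiB pages,
 * ena_max_frame_total is 9019, so ena_tx_buf_sz rounds up to 12288;
 * ena_rx_buf_sz, which also reserves ENA_RX_BUF_IPHDR_ALIGNMENT bytes
 * of margin, rounds up to 12288 as well.
 */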

static bool
ena_get_hints(ena_t *ena)
{
	int ret;
	enahw_resp_desc_t resp;
	enahw_device_hints_t *hints = &resp.erd_resp.erd_get_feat.ergf_hints;

	ena_dbg(ena, "Requesting hints");

	bzero(&resp, sizeof (resp));
	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_HW_HINTS,
	    ENAHW_FEAT_HW_HINTS_VER);

	if (ret == ENOTSUP) {
		/* In this case the device does not support querying hints */
		ena_dbg(ena, "Hints are unsupported");
		return (true);
	} else if (ret != 0) {
		ena_err(ena, "Error getting hints: %d", ret);
		return (false);
	}

	ena_update_hints(ena, hints);

	return (true);
}

static bool
ena_get_offloads(ena_t *ena)
{
	int ret = 0;
	enahw_resp_desc_t resp;
	enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;

	ena->ena_tx_l3_ipv4_csum = false;

	ena->ena_tx_l4_ipv4_part_csum = false;
	ena->ena_tx_l4_ipv4_full_csum = false;
	ena->ena_tx_l4_ipv4_lso = false;

	ena->ena_tx_l4_ipv6_part_csum = false;
	ena->ena_tx_l4_ipv6_full_csum = false;
	ena->ena_tx_l4_ipv6_lso = false;

	ena->ena_rx_l3_ipv4_csum = false;
	ena->ena_rx_l4_ipv4_csum = false;
	ena->ena_rx_l4_ipv6_csum = false;
	ena->ena_rx_hash = false;

	bzero(&resp, sizeof (resp));
	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
	    ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);

	if (ret == ENOTSUP) {
		/*
		 * In this case the device does not support querying
		 * for hardware offloads. We take that as a sign that
		 * the device provides no offloads.
		 */
		return (true);
	} else if (ret != 0) {
		ena_err(ena, "error getting stateless offload: %d", ret);
		return (false);
	}

	ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);

	ena->ena_tx_l4_ipv4_part_csum =
	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
	ena->ena_tx_l4_ipv4_full_csum =
	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
	ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);

	ena->ena_tx_l4_ipv6_part_csum =
	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
	ena->ena_tx_l4_ipv6_full_csum =
	    ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
	ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);

	ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
	ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
	ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
	return (true);
}

static int
ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
    const int defval)
{
	int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
	    DDI_PROP_DONTPASS, propname, defval);

	if (value > maxval) {
		ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
		    propname, value, maxval);
		value = maxval;
	}

	if (value < minval) {
		ena_err(ena, "user value %s=%d below minimum, setting to %d",
		    propname, value, minval);
		value = minval;
	}

	return (value);
}

static bool
ena_set_mtu(ena_t *ena)
{
	int ret = 0;
	enahw_cmd_desc_t cmd;
	enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
	enahw_resp_desc_t resp;

	bzero(&cmd, sizeof (cmd));
	bzero(&resp, sizeof (resp));
	feat->efm_mtu = ena->ena_mtu;

	if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
	    ENAHW_FEAT_MTU_VER)) != 0) {
		ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
		    ret);
		return (false);
	}

	return (true);
}

static void
ena_get_link_config(ena_t *ena)
{
	enahw_resp_desc_t resp;
	enahw_feat_link_conf_t *feat =
	    &resp.erd_resp.erd_get_feat.ergf_link_conf;
	bool full_duplex;

	bzero(&resp, sizeof (resp));

	if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
	    ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
		/*
		 * Some ENA devices do not support this feature. In
		 * those cases we report a 1Gbps link, full duplex.
		 * For the most accurate information on bandwidth
		 * limits see the official AWS documentation.
		 */
		ena->ena_link_speed_mbits = 1000;
		ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
		ena->ena_link_duplex = LINK_DUPLEX_FULL;
		ena->ena_link_autoneg = true;
		return;
	}

	ena->ena_link_speed_mbits = feat->eflc_speed;
	ena->ena_link_speeds = feat->eflc_supported;
	full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
	ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
	    LINK_DUPLEX_HALF;
	ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
}
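
/*
 * The conf-driven values below are read via ena_get_prop(), which
 * clamps rather than rejects out-of-range settings. For example (with
 * illustrative numbers): given a minimum of 64, a maximum of 4096 and
 * a default of 4096, a value of 16 set in ena.conf is raised to 64,
 * and a value of 100000 is lowered to 4096.
 */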

/*
 * Retrieve all configuration values which are modifiable via
 * ena.conf, and set ena_t members accordingly. While the conf values
 * have priority, they may be implicitly modified by the driver to
 * meet resource constraints on a given platform. If no value is
 * specified in the conf file, the driver will attempt to use the
 * largest value supported. While no supported value should be large
 * enough for this to matter, keep in mind that ena_get_prop() casts
 * the values to an int.
 *
 * This function should be called after the device is initialized, the
 * admin queue is established, and the hardware features/capabilities
 * have been queried; it should be called before mac registration.
 */
static bool
ena_attach_read_conf(ena_t *ena)
{
	uint32_t gcv;	/* Greatest Common Value */

	/*
	 * We expect that the queue lengths are the same for both the
	 * CQ and SQ, but technically the device could return
	 * different lengths. For now the driver locks them together.
	 */
	gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
	ASSERT3U(gcv, <=, INT_MAX);
	ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
	    ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);

	ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
	    ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
	    ENA_PROP_RXQ_INTR_LIMIT_DEF);

	gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
	ASSERT3U(gcv, <=, INT_MAX);
	ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
	    ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);

	return (true);
}

/*
 * Perform any necessary device configuration after the driver.conf
 * has been read.
 */
static bool
ena_attach_dev_cfg(ena_t *ena)
{
	ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);

	if (!ena_set_mtu(ena)) {
		/*
		 * We don't expect this to fail, but we try a fallback
		 * first before failing the attach sequence.
		 */
		ena->ena_mtu = 1500;
		ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);

		if (!ena_set_mtu(ena)) {
			return (false);
		}
	}

	return (true);
}

static bool
ena_check_versions(ena_t *ena)
{
	uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
	uint32_t ctrl_vsn =
	    ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);

	ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
	ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);

	ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
	ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
	ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
	ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);

	ena_dbg(ena, "device version: %u.%u",
	    ena->ena_dev_major_vsn, ena->ena_dev_minor_vsn);
	ena_dbg(ena, "controller version: %u.%u.%u implementation %u",
	    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
	    ena->ena_ctrl_subminor_vsn, ena->ena_ctrl_impl_id);

	if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
		ena_err(ena, "unsupported controller version: %u.%u.%u",
		    ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
		    ena->ena_ctrl_subminor_vsn);
		return (false);
	}

	return (true);
}

static bool
ena_adminq_init(ena_t *ena)
{
	ena_adminq_t *aq = &ena->ena_aq;

	/*
	 * As we are not using an interrupt for admin queue completion
	 * signaling, we do not need a priority on these mutexes. If
	 * that changes, we will have to rejigger some code to create
	 * the admin queue interrupt before this function.
	 */
	mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
	aq->ea_qlen = ENA_ADMINQ_DEPTH;
	aq->ea_pending_cmds = 0;

	aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
	    KM_SLEEP);
	list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
	    offsetof(ena_cmd_ctx_t, ectx_node));
	list_create(&aq->ea_cmd_ctxs_used, sizeof (ena_cmd_ctx_t),
	    offsetof(ena_cmd_ctx_t, ectx_node));

	ena_create_cmd_ctx(ena);

	/*
	 * Start in polling mode until we've determined the number of queues
	 * and are ready to configure and enable interrupts.
	 */
	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);
	aq->ea_poll_mode = true;

	return (true);
}

/*
 * Free all resources allocated as part of ena_device_init().
 */
static void
ena_cleanup_device_init(ena_t *ena, bool resetting)
{
	ena_adminq_t *aq = &ena->ena_aq;

	VERIFY0(resetting);

	ena_free_host_info(ena);
	mutex_destroy(&aq->ea_sq_lock);
	mutex_destroy(&aq->ea_cq_lock);
	mutex_destroy(&aq->ea_stat_lock);
	list_destroy(&aq->ea_cmd_ctxs_free);
	list_destroy(&aq->ea_cmd_ctxs_used);
	kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
	ena_admin_sq_free(ena);
	ena_admin_cq_free(ena);
	ena_aenq_free(ena);
	ena_stat_device_cleanup(ena);
	ena_stat_device_basic_cleanup(ena);
	ena_stat_device_extended_cleanup(ena);
	ena_stat_aenq_cleanup(ena);
}

static bool
ena_attach_device_init(ena_t *ena)
{
	ena_adminq_t *aq = &ena->ena_aq;
	uint32_t rval;
	uint8_t dma_width;
	hrtime_t cmd_timeout;
	enahw_resp_desc_t resp;
	enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
	uint8_t *maddr;
	uint32_t supported_features;
	int ret = 0;

	ena->ena_reset_reason = ENAHW_RESET_NORMAL;
	if (!ena_device_reset(ena, ena->ena_reset_reason))
		return (false);

	if (!ena_check_versions(ena))
		return (false);

	ena_init_regcache(ena);

	rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
	dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
	ena->ena_dma_width = dma_width;

	/*
	 * The value stored in the device register is in the
	 * resolution of 100 milliseconds. We normalize that to
	 * nanoseconds.
	 */
	cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
	aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);

	if (aq->ea_cmd_timeout_ns == 0)
		aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT_NS;

	if (!ena_adminq_init(ena))
		return (false);

	if (!ena_admin_sq_init(ena))
		return (false);

	if (!ena_admin_cq_init(ena))
		return (false);

	if (!ena_aenq_init(ena))
		return (false);

	bzero(&resp, sizeof (resp));
	ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
	    ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);

	if (ret != 0) {
		ena_err(ena, "failed to get device attributes: %d", ret);
		return (false);
	}

	ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
	ena_dbg(ena, "device version: %u", feat->efda_device_version);
	ena_dbg(ena, "supported features: 0x%x",
	    feat->efda_supported_features);
	ena_dbg(ena, "device capabilities: 0x%x", feat->efda_capabilities);
	ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
	ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
	maddr = feat->efda_mac_addr;
	ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
	    maddr[2], maddr[3], maddr[4], maddr[5]);
	ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);

	bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
	ena->ena_max_mtu = feat->efda_max_mtu;
	ena->ena_capabilities = feat->efda_capabilities;
	supported_features = feat->efda_supported_features;
	ena->ena_supported_features = supported_features;
	feat = NULL;
	bzero(&resp, sizeof (resp));

	if (ena_is_feat_avail(ena, ENAHW_FEAT_MAX_QUEUES_EXT)) {
		enahw_feat_max_queue_ext_t *feat_mqe =
		    &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;

		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
		    ENAHW_FEAT_MAX_QUEUES_EXT_VER);

		if (ret != 0) {
			ena_err(ena, "failed to query max queues ext: %d", ret);
			return (false);
		}

		ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
		ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
		ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
		ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
		ena->ena_tx_max_desc_per_pkt =
		    feat_mqe->efmqe_max_per_packet_tx_descs;
		ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;

		ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
		ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
		ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
		ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
		ena->ena_rx_max_desc_per_pkt =
		    feat_mqe->efmqe_max_per_packet_rx_descs;

		ena_set_max_io_queues(ena);
	} else {
		enahw_feat_max_queue_t *feat_mq =
		    &resp.erd_resp.erd_get_feat.ergf_max_queue;

		ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
		    ENAHW_FEAT_MAX_QUEUES_NUM_VER);

		if (ret != 0) {
			ena_err(ena, "failed to query max queues: %d", ret);
			return (false);
		}

		ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
		ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
		ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
		ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
		ena->ena_tx_max_desc_per_pkt =
		    feat_mq->efmq_max_per_packet_tx_descs;
		ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;

		ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
		ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
		ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
		ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
		ena->ena_rx_max_desc_per_pkt =
		    feat_mq->efmq_max_per_packet_rx_descs;

		ena_set_max_io_queues(ena);
	}

	ena->ena_mtu = ena->ena_max_mtu;
	ena_update_buf_sizes(ena);

	if (!ena_get_hints(ena))
		return (false);

	ena->ena_tx_sgl_max_sz = 1;
	ena->ena_rx_sgl_max_sz = 1;
	if (ena->ena_device_hints.eh_max_tx_sgl != 0)
		ena->ena_tx_sgl_max_sz = ena->ena_device_hints.eh_max_tx_sgl;
	if (ena->ena_device_hints.eh_max_rx_sgl != 0)
		ena->ena_rx_sgl_max_sz = ena->ena_device_hints.eh_max_rx_sgl;

	if (!ena_init_host_info(ena))
		return (false);

	if (!ena_aenq_configure(ena))
		return (false);

	ena_get_link_config(ena);

	if (!ena_get_offloads(ena))
		return (false);

	if (!ena_stat_device_init(ena))
		return (false);

	if (!ena_stat_device_basic_init(ena))
		return (false);

	if (!ena_stat_device_extended_init(ena))
		return (false);

	if (!ena_stat_aenq_init(ena))
		return (false);

	ena_update_regcache(ena);

	return (true);
}

static void
ena_cleanup_intr_alloc(ena_t *ena, bool resetting)
{
	VERIFY0(resetting);

	for (int i = 0; i < ena->ena_num_intrs; i++) {
		int ret = ddi_intr_free(ena->ena_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			ena_err(ena, "failed to free interrupt %d: %d", i, ret);
		}
	}

	if (ena->ena_intr_handles != NULL) {
		kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
		ena->ena_intr_handles = NULL;
		ena->ena_intr_handles_sz = 0;
	}
}

/*
 * The Linux driver supports only MSI-X interrupts. We do the same,
 * with the assumption that it's the only type of interrupt the device
 * can present.
 */
static bool
ena_attach_intr_alloc(ena_t *ena)
{
	int ret;
	int types;
	int min, req, ideal, avail, actual;

	ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to get interrupt types: %d", ret);
		return (false);
	}

	ena_dbg(ena, "supported interrupt types: 0x%x", types);
	if ((types & DDI_INTR_TYPE_MSIX) == 0) {
		ena_err(ena, "the ena driver only supports MSI-X interrupts");
		return (false);
	}

	/* One for I/O, one for adminq. */
	min = 2;
	ideal = ena->ena_max_io_queues + 1;
	ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to get number of MSI-X interrupts: %d",
		    ret);
		return (false);
	}

	if (avail < min) {
		ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
		    "requires a minimum of %d", avail, min);
		return (false);
	}

	ena_dbg(ena, "%d MSI-X interrupts available", avail);

	ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to get available interrupts: %d", ret);
		return (false);
	}

	if (avail < min) {
		ena_err(ena, "number of available MSI-X interrupts is %d, "
		    "but the driver requires a minimum of %d", avail, min);
		return (false);
	}

	req = MIN(ideal, avail);
	ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
	ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);

	ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
	    DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
		    req, ret);
		return (false);
	}

	if (actual < min) {
		ena_err(ena, "number of allocated interrupts is %d, but the "
		    "driver requires a minimum of %d", actual, min);
		return (false);
	}

	ena->ena_num_intrs = actual;

	ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to get interrupt capability: %d", ret);
		return (false);
	}

	ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
	if (ret != DDI_SUCCESS) {
		ena_err(ena, "failed to get interrupt priority: %d", ret);
		return (false);
	}

	ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
	    actual, ena->ena_intr_caps, ena->ena_intr_pri);

	/*
	 * The ena_lock should not be held in the data path, but it is
	 * held as part of the AENQ handler, which runs in interrupt
	 * context. Therefore, we delay the initialization of this
	 * mutex until after the interrupts are allocated.
	 */
	mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(ena->ena_intr_pri));
	mutex_init(&ena->ena_watchdog_lock, NULL, MUTEX_DRIVER, NULL);

	return (true);
}

/*
 * Allocate the parent Rx queue structures. More importantly, this is
 * NOT allocating the queue descriptors or data buffers. Those are
 * allocated on demand as queues are started.
 */
static bool
ena_attach_alloc_rxqs(ena_t *ena)
{
	bool resetting = false;

	if (ena->ena_rxqs == NULL) {
		/*
		 * We rely on the interrupt priority for initializing the
		 * mutexes.
		 */
		VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
		ena->ena_num_rxqs = ena->ena_num_intrs - 1;
		ASSERT3U(ena->ena_num_rxqs, >, 0);
		ena->ena_rxqs = kmem_zalloc(
		    ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), KM_SLEEP);
	} else {
		resetting = true;
	}

	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
		ena_rxq_t *rxq = &ena->ena_rxqs[i];

		rxq->er_rxqs_idx = i;
		/* The 0th vector is for Admin + AENQ. */
		rxq->er_intr_vector = i + 1;
		rxq->er_mrh = NULL;

		if (!resetting) {
			mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
			    DDI_INTR_PRI(ena->ena_intr_pri));
			mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
			    DDI_INTR_PRI(ena->ena_intr_pri));
		}

		rxq->er_ena = ena;
		rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
		rxq->er_cq_num_descs = ena->ena_rxq_num_descs;

		if (!ena_stat_rxq_init(rxq)) {
			return (false);
		}

		if (!ena_alloc_rxq(rxq)) {
			ena_stat_rxq_cleanup(rxq);
			return (false);
		}
	}

	return (true);
}

static void
ena_cleanup_rxqs(ena_t *ena, bool resetting)
{
	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
		ena_rxq_t *rxq = &ena->ena_rxqs[i];

		ena_cleanup_rxq(rxq, resetting);
		if (!resetting) {
			mutex_destroy(&rxq->er_lock);
			mutex_destroy(&rxq->er_stat_lock);
		}
		ena_stat_rxq_cleanup(rxq);
	}

	if (!resetting) {
		kmem_free(ena->ena_rxqs,
		    ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
		ena->ena_rxqs = NULL;
	}
}

/*
 * Allocate the parent Tx queue structures. More importantly, this is
 * NOT allocating the queue descriptors or data buffers. Those are
 * allocated on demand as a queue is started.
 */
static bool
ena_attach_alloc_txqs(ena_t *ena)
{
	bool resetting = false;

	if (ena->ena_txqs == NULL) {
		/*
		 * We rely on the interrupt priority for initializing the
		 * mutexes.
		 */
		VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
		ena->ena_num_txqs = ena->ena_num_intrs - 1;
		ASSERT3U(ena->ena_num_txqs, >, 0);
		ena->ena_txqs = kmem_zalloc(
		    ena->ena_num_txqs * sizeof (*ena->ena_txqs), KM_SLEEP);
	} else {
		resetting = true;
	}

	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
		ena_txq_t *txq = &ena->ena_txqs[i];

		txq->et_txqs_idx = i;
		/* The 0th vector is for Admin + AENQ. */
		txq->et_intr_vector = i + 1;
		txq->et_mrh = NULL;

		if (!resetting) {
			mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
			    DDI_INTR_PRI(ena->ena_intr_pri));
			mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
			    DDI_INTR_PRI(ena->ena_intr_pri));
		}

		txq->et_ena = ena;
		txq->et_sq_num_descs = ena->ena_txq_num_descs;
		txq->et_cq_num_descs = ena->ena_txq_num_descs;

		if (!ena_stat_txq_init(txq)) {
			return (false);
		}

		if (!ena_alloc_txq(txq)) {
			ena_stat_txq_cleanup(txq);
			return (false);
		}
	}

	return (true);
}

static void
ena_cleanup_txqs(ena_t *ena, bool resetting)
{
	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
		ena_txq_t *txq = &ena->ena_txqs[i];

		ena_cleanup_txq(txq, resetting);
		if (!resetting) {
			mutex_destroy(&txq->et_lock);
			mutex_destroy(&txq->et_stat_lock);
		}
		ena_stat_txq_cleanup(txq);
	}

	if (!resetting) {
		kmem_free(ena->ena_txqs,
		    ena->ena_num_txqs * sizeof (*ena->ena_txqs));
		ena->ena_txqs = NULL;
	}
}

/*
 * To reset the device we need to unwind some of the steps taken during attach
 * but, since the device could well be in a failed state, we cannot rely on
 * being able to talk via the admin queue to do things such as explicitly
 * destroy rings.
 * We call selected cleanup handlers with the second parameter
 * set to "true" to indicate that we are resetting and should avoid such
 * communication.
 *
 * The existing DMA memory regions for the admin queue, async event queue and
 * host information are preserved but have their contents zeroed.
 * Experimentation has shown that the device hangs onto old async event queue
 * addresses, even through a reset, with surprising results if the addresses
 * happen to change.
 *
 * We clean up all of the Tx and Rx ring descriptors and the TCBs but leave the
 * allocated memory for the ring data and mutexes intact. Pointers to this
 * memory have already been provided to MAC, and the mutexes keep the rings
 * locked until we're ready to start them again.
 *
 * To ensure that other driver activity is excluded, we hold the mutexes on the
 * Tx and Rx rings throughout, and unset the `ENA_STATE_STARTED` bit in the
 * state, which causes the interrupt handlers to return without doing any work.
 * The admin interrupt, used for notifications of admin completions or new
 * asynchronous events, is masked after the device is reset until we're ready
 * to process them again.
 */
bool
ena_reset(ena_t *ena, const enahw_reset_reason_t reason)
{
	ena_txq_state_t tx_state[ena->ena_num_txqs];
	ena_rxq_state_t rx_state[ena->ena_num_rxqs];
	bool ret = false;

	ena_err(ena, "resetting device with reason 0x%x [%s]",
	    reason, enahw_reset_reason(reason));

	VERIFY0(ena->ena_state & ENA_STATE_RESETTING);
	atomic_or_32(&ena->ena_state, ENA_STATE_RESETTING);

	VERIFY(ena->ena_state & ENA_STATE_STARTED);
	atomic_and_32(&ena->ena_state, ~ENA_STATE_STARTED);

	mutex_enter(&ena->ena_lock);

	ena_update_regcache(ena);

	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
		ena_txq_t *txq = &ena->ena_txqs[i];

		mutex_enter(&txq->et_lock);
		tx_state[i] = txq->et_state;
		if (txq->et_state & ENA_TXQ_STATE_RUNNING)
			ena_ring_tx_stop((mac_ring_driver_t)txq);
	}

	for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
		ena_rxq_t *rxq = &ena->ena_rxqs[i];

		mutex_enter(&rxq->er_lock);
		rx_state[i] = rxq->er_state;
		if (rxq->er_state & ENA_RXQ_STATE_RUNNING)
			ena_ring_rx_stop((mac_ring_driver_t)rxq);
	}

	if (!ena_device_reset(ena, reason)) {
		ena_err(ena, "reset: failed to reset device");
		goto out;
	}

	/* This masks the admin/aenq interrupt */
	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK);

	ena_cleanup_txqs(ena, true);
	ena_cleanup_rxqs(ena, true);

	ena_release_all_cmd_ctx(ena);

	if (!ena_admin_cq_init(ena) || !ena_admin_sq_init(ena)) {
		ena_err(ena, "reset: failed to program admin queues");
		goto out;
	}

	if (!ena_init_host_info(ena)) {
		ena_err(ena, "reset: failed to set host info");
		goto out;
	}

	if (!ena_aenq_init(ena) || !ena_aenq_configure(ena)) {
		ena_err(ena, "reset: failed to configure aenq");
		goto out;
	}

	if (!ena_set_mtu(ena)) {
		ena_err(ena, "reset: failed to set MTU");
		goto out;
	}

	if (!ena_attach_alloc_txqs(ena) || !ena_attach_alloc_rxqs(ena)) {
		ena_err(ena, "reset: failed to program IO queues");
		goto out;
	}

	ena_aenq_enable(ena);
	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1688 1689 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { 1690 ena_rxq_t *rxq = &ena->ena_rxqs[i]; 1691 1692 mutex_exit(&rxq->er_lock); 1693 if (rx_state[i] & ENA_RXQ_STATE_RUNNING) { 1694 (void) ena_ring_rx_start((mac_ring_driver_t)rxq, 1695 rxq->er_m_gen_num); 1696 } 1697 } 1698 1699 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 1700 ena_txq_t *txq = &ena->ena_txqs[i]; 1701 1702 mutex_exit(&txq->et_lock); 1703 if (tx_state[i] & ENA_TXQ_STATE_RUNNING) { 1704 (void) ena_ring_tx_start((mac_ring_driver_t)txq, 1705 txq->et_m_gen_num); 1706 } 1707 } 1708 1709 atomic_or_32(&ena->ena_state, ENA_STATE_STARTED); 1710 ret = true; 1711 1712 out: 1713 atomic_and_32(&ena->ena_state, ~ENA_STATE_RESETTING); 1714 mutex_exit(&ena->ena_lock); 1715 1716 ena_update_regcache(ena); 1717 1718 return (ret); 1719 } 1720 1721 ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = { 1722 { 1723 .ead_seq = ENA_ATTACH_PCI, 1724 .ead_name = "PCI config", 1725 .ead_attach_fn = ena_attach_pci, 1726 .ead_attach_hard_fail = true, 1727 .ead_cleanup_fn = ena_cleanup_pci, 1728 }, 1729 1730 { 1731 .ead_seq = ENA_ATTACH_REGS, 1732 .ead_name = "BAR mapping", 1733 .ead_attach_fn = ena_attach_regs_map, 1734 .ead_attach_hard_fail = true, 1735 .ead_cleanup_fn = ena_cleanup_regs_map, 1736 }, 1737 1738 { 1739 .ead_seq = ENA_ATTACH_DEV_INIT, 1740 .ead_name = "device initialization", 1741 .ead_attach_fn = ena_attach_device_init, 1742 .ead_attach_hard_fail = true, 1743 .ead_cleanup_fn = ena_cleanup_device_init, 1744 }, 1745 1746 { 1747 .ead_seq = ENA_ATTACH_READ_CONF, 1748 .ead_name = "ena.conf", 1749 .ead_attach_fn = ena_attach_read_conf, 1750 .ead_attach_hard_fail = true, 1751 .ead_cleanup_fn = NULL, 1752 }, 1753 1754 { 1755 .ead_seq = ENA_ATTACH_DEV_CFG, 1756 .ead_name = "device config", 1757 .ead_attach_fn = ena_attach_dev_cfg, 1758 .ead_attach_hard_fail = true, 1759 .ead_cleanup_fn = NULL, 1760 }, 1761 1762 { 1763 .ead_seq = ENA_ATTACH_INTR_ALLOC, 1764 .ead_name = "interrupt allocation", 1765 .ead_attach_fn = ena_attach_intr_alloc, 1766 .ead_attach_hard_fail = true, 1767 .ead_cleanup_fn = ena_cleanup_intr_alloc, 1768 }, 1769 1770 { 1771 .ead_seq = ENA_ATTACH_INTR_HDLRS, 1772 .ead_name = "interrupt handlers", 1773 .ead_attach_fn = ena_intr_add_handlers, 1774 .ead_attach_hard_fail = true, 1775 .ead_cleanup_fn = ena_intr_remove_handlers, 1776 }, 1777 1778 { 1779 .ead_seq = ENA_ATTACH_TXQS_ALLOC, 1780 .ead_name = "Tx queues", 1781 .ead_attach_fn = ena_attach_alloc_txqs, 1782 .ead_attach_hard_fail = true, 1783 .ead_cleanup_fn = ena_cleanup_txqs, 1784 }, 1785 1786 { 1787 .ead_seq = ENA_ATTACH_RXQS_ALLOC, 1788 .ead_name = "Rx queues", 1789 .ead_attach_fn = ena_attach_alloc_rxqs, 1790 .ead_attach_hard_fail = true, 1791 .ead_cleanup_fn = ena_cleanup_rxqs, 1792 }, 1793 1794 /* 1795 * The chance of mac_unregister() failure poses a problem to 1796 * cleanup. We address interrupt disablement and mac 1797 * unregistration explicitly in the attach/detach routines. 1798 */ 1799 { 1800 .ead_seq = ENA_ATTACH_MAC_REGISTER, 1801 .ead_name = "mac registration", 1802 .ead_attach_fn = ena_mac_register, 1803 .ead_attach_hard_fail = true, 1804 .ead_cleanup_fn = NULL, 1805 }, 1806 1807 { 1808 .ead_seq = ENA_ATTACH_INTRS_ENABLE, 1809 .ead_name = "enable interrupts", 1810 .ead_attach_fn = ena_intrs_enable, 1811 .ead_attach_hard_fail = true, 1812 .ead_cleanup_fn = NULL, 1813 } 1814 }; 1815 1816 /* 1817 * This function undoes any work done by ena_attach(), either in 1818 * response to a failed attach or a planned detach. 
At the end of this
1819 * function, ena_attach_seq should be zero; otherwise it means
1820 * something has not been freed or uninitialized.
1821 */
1822 static void
1823 ena_cleanup(ena_t *ena)
1824 {
1825 if (ena == NULL || ena->ena_attach_seq == 0) {
1826 return;
1827 }
1828 
1829 /*
1830 * We VERIFY this because if the seq were greater than the number of
1831 * table entries we would index past the end of ena_attach_tbl.
1832 */
1833 VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);
1834 
1835 while (ena->ena_attach_seq > 0) {
1836 int idx = ena->ena_attach_seq - 1;
1837 ena_attach_desc_t *desc = &ena_attach_tbl[idx];
1838 
1839 ena_dbg(ena, "running cleanup sequence: %s (%d)",
1840 desc->ead_name, idx);
1841 
1842 if (desc->ead_cleanup_fn != NULL)
1843 desc->ead_cleanup_fn(ena, false);
1844 ena->ena_attach_seq--;
1845 }
1846 
1847 ASSERT3U(ena->ena_attach_seq, ==, 0);
1848 mutex_destroy(&ena->ena_lock);
1849 mutex_destroy(&ena->ena_watchdog_lock);
1850 }
1851 
1852 static int
1853 ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1854 {
1855 ena_t *ena;
1856 
1857 if (cmd != DDI_ATTACH) {
1858 return (DDI_FAILURE);
1859 }
1860 
1861 ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
1862 ena->ena_instance = ddi_get_instance(dip);
1863 ena->ena_dip = dip;
1865 ena->ena_page_sz = ddi_ptob(dip, 1);
1866 
1867 for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
1868 bool success;
1869 ena_attach_desc_t *desc = &ena_attach_tbl[i];
1870 
1871 ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
1872 i);
1873 
1874 if (!(success = desc->ead_attach_fn(ena))) {
1875 ena_err(ena, "attach sequence failed: %s (%d)",
1876 desc->ead_name, i);
1877 
1878 if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
1879 /*
1880 * In this specific case
1881 * ENA_ATTACH_INTRS_ENABLE has failed,
1882 * and we may or may not be able to
1883 * unregister the mac, depending on
1884 * whether something in userspace has
1885 * created a client on top.
1886 *
1887 * NOTE: Something that would be nice
1888 * to add to mac is the ability to
1889 * register a provider separate from
1890 * "publishing" it to the rest of the
1891 * system. This would allow a driver
1892 * to register its mac, do some
1893 * additional work that might fail,
1894 * and then unregister if that work
1895 * fails without concern for any
1896 * chance of failure when calling
1897 * unregister. This would remove the
1898 * complexity of the situation we are
1899 * trying to address here, as we would
1900 * know that until the mac has been
1901 * "published", there is no chance for
1902 * mac_unregister() to fail.
1903 */
1904 if (ena_mac_unregister(ena) != 0) {
1905 return (DDI_FAILURE);
1906 }
1907 
1908 ena->ena_attach_seq--;
1909 } else {
1910 /*
1911 * Since ead_seq is only recorded after
1912 * a successful ead_attach_fn, the
1913 * global cleanup routine will not run
1914 * this step's cleanup handler; we must
1915 * run it here first. This also means
1916 * that all cleanup functions must be
1917 * able to deal with partial success of
1918 * the corresponding ead_attach_fn.
1919 */
1920 if (desc->ead_cleanup_fn != NULL)
1921 desc->ead_cleanup_fn(ena, false);
1922 }
1923 
1924 ena_cleanup(ena);
1925 kmem_free(ena, sizeof (ena_t));
1926 return (DDI_FAILURE);
1927 }
1928 
1929 if (success) {
1930 ena_dbg(ena, "attach sequence completed: %s (%d)",
1931 desc->ead_name, i);
1932 }
1933 
1934 ena->ena_attach_seq = desc->ead_seq;
1935 }
1936 
1937 /*
1938 * Now that interrupts are enabled, unmask the admin interrupt.
1939 * Note that this interrupt is generated for both the admin queue and
1940 * the AENQ, but this driver always polls the admin queue. The surplus
1941 * interrupt for admin command completion triggers a harmless check of
1942 * the AENQ.
1943 */
1944 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
1945 ena_aenq_enable(ena);
1946 
1947 ddi_set_driver_private(dip, ena);
1948 
1949 ena_update_regcache(ena);
1950 
1951 atomic_or_32(&ena->ena_state, ENA_STATE_INITIALIZED);
1952 
1953 return (DDI_SUCCESS);
1954 }
1955 
1956 static int
1957 ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1958 {
1959 ena_t *ena = ddi_get_driver_private(dip);
1960 
1961 if (ena == NULL) {
1962 return (DDI_FAILURE);
1963 }
1964 
1965 /*
1966 * Before we can proceed with cleanup we have to handle
1967 * mac_unregister() explicitly -- if there are still
1968 * outstanding clients, then we can't proceed with detach or
1969 * cleanup.
1970 */
1971 
1972 /*
1973 * It isn't clear why disabling interrupts would fail, but if we
1974 * proceeded to unregister the mac regardless, there is a good chance
1975 * we would panic in the Rx interrupt handler when calling mac_rx_ring().
1976 */
1977 if (!ena_intrs_disable(ena)) {
1978 return (DDI_FAILURE);
1979 }
1980 
1981 /* We can't detach if clients are actively using the device. */
1982 if (ena_mac_unregister(ena) != 0) {
1983 (void) ena_intrs_enable(ena);
1984 return (DDI_FAILURE);
1985 }
1986 
1987 /*
1988 * At this point we can proceed with the rest of cleanup on a
1989 * best-effort basis.
1990 */
1991 ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
1992 ena_cleanup(ena);
1993 ddi_set_driver_private(dip, NULL);
1994 kmem_free(ena, sizeof (ena_t));
1995 return (DDI_SUCCESS);
1996 }
1997 
1998 static struct cb_ops ena_cb_ops = {
1999 .cb_open = nodev,
2000 .cb_close = nodev,
2001 .cb_strategy = nodev,
2002 .cb_print = nodev,
2003 .cb_dump = nodev,
2004 .cb_read = nodev,
2005 .cb_write = nodev,
2006 .cb_ioctl = nodev,
2007 .cb_devmap = nodev,
2008 .cb_mmap = nodev,
2009 .cb_segmap = nodev,
2010 .cb_chpoll = nochpoll,
2011 .cb_prop_op = ddi_prop_op,
2012 .cb_flag = D_MP,
2013 .cb_rev = CB_REV,
2014 .cb_aread = nodev,
2015 .cb_awrite = nodev
2016 };
2017 
2018 static struct dev_ops ena_dev_ops = {
2019 .devo_rev = DEVO_REV,
2020 .devo_refcnt = 0,
2021 .devo_getinfo = NULL,
2022 .devo_identify = nulldev,
2023 .devo_probe = nulldev,
2024 .devo_attach = ena_attach,
2025 .devo_detach = ena_detach,
2026 .devo_reset = nodev,
2027 .devo_quiesce = ddi_quiesce_not_supported,
2028 .devo_cb_ops = &ena_cb_ops
2029 };
2030 
2031 static struct modldrv ena_modldrv = {
2032 .drv_modops = &mod_driverops,
2033 .drv_linkinfo = "AWS ENA Ethernet",
2034 .drv_dev_ops = &ena_dev_ops
2035 };
2036 
2037 static struct modlinkage ena_modlinkage = {
2038 .ml_rev = MODREV_1,
2039 .ml_linkage = { &ena_modldrv, NULL }
2040 };
2041 
2042 int
2043 _init(void)
2044 {
2045 int ret;
2046 
2047 mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);
2048 
2049 if ((ret = mod_install(&ena_modlinkage)) != 0) {
2050 mac_fini_ops(&ena_dev_ops);
2051 return (ret);
2052 }
2053 
2054 return (ret);
2055 }
2056 
2057 int
2058 _info(struct modinfo *modinfop)
2059 {
2060 return (mod_info(&ena_modlinkage, modinfop));
2061 }
2062 
2063 int
2064 _fini(void)
2065 {
2066 int ret;
2067 
2068 if ((ret = mod_remove(&ena_modlinkage)) != 0) {
2069 return (ret);
2070 }
2071 
2072 mac_fini_ops(&ena_dev_ops);
2073 return (ret);
2074 }
2075 
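/*
 * The attach/detach scheme above is table-driven: each step's ead_seq is
 * recorded only after its ead_attach_fn succeeds, and ena_cleanup() walks
 * the table backwards, invoking the paired ead_cleanup_fn for every step
 * that completed. The standalone userland sketch below (kept as a comment;
 * it is not part of the driver) illustrates that pattern in miniature. The
 * step_t type and the attach/cleanup functions are invented for the example.
 *
 *	#include <stdbool.h>
 *	#include <stddef.h>
 *	#include <stdio.h>
 *
 *	typedef struct step {
 *		const char *s_name;
 *		bool (*s_attach)(void);
 *		void (*s_cleanup)(void);
 *	} step_t;
 *
 *	static bool attach_a(void) { return (true); }
 *	static void cleanup_a(void) { (void) printf("undo a\n"); }
 *	static bool attach_b(void) { return (false); }	// simulated failure
 *	static void cleanup_b(void) { (void) printf("undo b\n"); }
 *
 *	static step_t tbl[] = {
 *		{ "a", attach_a, cleanup_a },
 *		{ "b", attach_b, cleanup_b },
 *	};
 *
 *	int
 *	main(void)
 *	{
 *		size_t nsteps = sizeof (tbl) / sizeof (tbl[0]);
 *		size_t seq = 0;
 *
 *		for (size_t i = 0; i < nsteps; i++) {
 *			if (!tbl[i].s_attach()) {
 *				// Unwind completed steps in reverse order,
 *				// mirroring ena_cleanup().
 *				while (seq > 0) {
 *					seq--;
 *					if (tbl[seq].s_cleanup != NULL)
 *						tbl[seq].s_cleanup();
 *				}
 *				return (1);
 *			}
 *			// Record the step only after it succeeds.
 *			seq = i + 1;
 *		}
 *		return (0);
 *	}
 *
 * Running the sketch attaches "a", fails on "b", and unwinds only "a" --
 * the same property that lets ena_attach() bail out safely at any step.
 */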
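/*
 * The reset path in ena_reset() relies on the I/O interrupt handlers
 * (implemented elsewhere in the driver) observing ENA_STATE_STARTED and
 * returning without doing any work while it is clear. The sketch below is
 * only an illustration of that bail-out pattern, using C11 atomics in place
 * of the illumos atomic_or_32()/atomic_and_32() routines; the names and the
 * handler body are invented and are not the driver's actual code.
 *
 *	#include <stdatomic.h>
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	#define	STATE_STARTED	(1U << 0)
 *	#define	STATE_RESETTING	(1U << 1)
 *
 *	static _Atomic uint32_t state;
 *
 *	// An I/O "interrupt handler" sketch: return without doing any
 *	// work unless the started bit is set.
 *	static bool
 *	io_intr(void)
 *	{
 *		if ((atomic_load(&state) & STATE_STARTED) == 0)
 *			return (false);
 *		// ... process Tx/Rx completions ...
 *		return (true);
 *	}
 *
 *	static void
 *	reset_begin(void)
 *	{
 *		atomic_fetch_or(&state, STATE_RESETTING);
 *		atomic_fetch_and(&state, ~STATE_STARTED);
 *	}
 *
 *	static void
 *	reset_end(void)
 *	{
 *		atomic_fetch_or(&state, STATE_STARTED);
 *		atomic_fetch_and(&state, ~STATE_RESETTING);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		atomic_fetch_or(&state, STATE_STARTED);
 *		(void) io_intr();	// does work
 *		reset_begin();
 *		(void) io_intr();	// returns immediately
 *		reset_end();
 *		return (0);
 *	}
 *
 * Any handler that fires between reset_begin() and reset_end() sees the
 * started bit clear and returns immediately, which is the exclusion the
 * comment above ena_reset() describes.
 */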