1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2024 Oxide Computer Company 14 */ 15 16 #include "ena_hw.h" 17 #include "ena.h" 18 19 /* 20 * Elastic Network Adapter (ENA) Driver 21 * ------------------------------------ 22 * 23 * The ena driver provides support for the AWS ENA device, also 24 * referred to as their "enhanced networking". This device is present 25 * on "Nitro"-based instances. It presents itself with the following 26 * PCI Vendor/Device IDs 27 * 28 * o 1d0f:0ec2 -- ENA PF 29 * o 1d0f:1ec2 -- ENA PF (Reserved) 30 * o 1d0f:ec20 -- ENA VF 31 * o 1d0f:ec21 -- ENA VF (Reserved) 32 * 33 * This driver provides support for only the essential features needed 34 * to drive traffic on an ENA device. Support for the following 35 * features IS NOT currently implemented. 36 * 37 * o Admin Queue Interrupts: queue completion events are always polled 38 * o AENQ keep alive 39 * o FMA 40 * o Rx checksum offloads 41 * o Tx checksum offloads 42 * o Tx DMA bind (borrow buffers) 43 * o Rx DMA bind (loaned buffers) 44 * o TSO 45 * o RSS 46 * o Low Latency Queues (LLQ) 47 * o Support for different Tx completion policies 48 * o More controlled Tx recycling and Rx refill 49 * 50 * Even without these features the ena driver should perform 51 * reasonably well. 52 * 53 * Driver vs. Hardware Types 54 * ------------------------- 55 * 56 * To properly communicate with the ENA device the driver must 57 * populate memory (registers and buffers) with specific types. These 58 * types are defined by the device and are found under the "common" 59 * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have 60 * simplified this a bit by defining all device-specific types in the 61 * ena_hw.h file. Furthermore, all device-specific types are given an 62 * "enahw" prefix. This makes it clear when we are dealing with a 63 * device type and when we are dealing with a driver type. 64 * 65 * [1]: https://github.com/amzn/amzn-drivers 66 * 67 * Groups, Rings (Queues), and Interrupts 68 * -------------------------------------- 69 * 70 * The ENA device presents one mac group. This single mac group 71 * represents the single unicast address that this device represents 72 * in your AWS instance. The ENA device presents no option for 73 * configuring additional MAC addresses, multicast, or promisc mode -- 74 * you receive only what AWS wants you to receive. 75 * 76 * This single mac group may have one or more rings. The ENA driver 77 * refers to rings as queues, for no special reason other than it was 78 * the dominant language in the Linux and FreeBSD drivers, and it 79 * spilled over into this port. The upper bound on number of queues is 80 * presented by the device. However, we don't just go with whatever 81 * number of queues the device reports; but rather we limit the queues 82 * based on other factors such as an absolute maximum, number of 83 * online CPUs, and number of available interrupts. The upper bound is 84 * calculated by ena_set_max_io_queues(), and that is used and 85 * possibly further restricted in ena_attach_intr_alloc(). 
At this 86 * point, ultimately, it is the number of available interrupts (minus 87 * one for the admin queue) that determines the number of queues: one 88 * Tx and one Rx on each I/O interrupt. 89 * 90 * NOTE: Perhaps it is overly restrictive to limit the number of 91 * queues to the number of I/O interrupts. Something worth considering 92 * on larger instances if they present far fewer interrupts than they 93 * do queues + CPUs. 94 * 95 * The ENA device presents MSI-X interrupts only. During attach the 96 * driver queries the number of available interrupts and sets aside 97 * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N). 98 * This means that a Tx/Rx queue at index 0 will map to vector 1, and 99 * so on. 100 * 101 * NOTE: The ENA driver currently doesn't make use of the Admin Queue 102 * interrupt. This interrupt is used to notify the driver that a 103 * command response is ready. The ENA driver always polls the Admin 104 * Queue for responses. 105 * 106 * Tx Queue Workings 107 * ----------------- 108 * 109 * A single Tx queue (ena_txq_t) is made up of one submission queue 110 * (SQ) and its paired completion queue (CQ). These two queues form a 111 * logical descriptor ring which is used to send packets out of the 112 * device -- where each SQ entry describes the packet to be sent 113 * (enahw_tx_desc_t) and each CQ entry describes the result of sending 114 * a packet (enahw_tx_cdesc_t). For this to work the host and device 115 * must agree on which descriptors are currently owned by the host 116 * (free for sending) and which are owned by the device (pending 117 * device completion). This state is tracked on the host side via head 118 * and tail indexes along with a phase value. 119 * 120 * The head and tail values represent the head and tail of the FIFO 121 * queue of pending packets -- the next packet to be sent by the 122 * device is head, and all descriptors up to tail are ready for 123 * sending. The phase allows the host to determine which CQ 124 * descriptors represent completed events when using per-SQ completion 125 * events (as opposed to queue head pointer updates). As the queues 126 * represent a logical ring buffer, the phase must alternate on 127 * wrap-around. The device initializes the phase to zero, and the host 128 * starts with a phase of 1. The first packet descriptor writes, and 129 * their corresponding completions, are indicated with a phase of 1. 130 * 131 * 132 * For example, the diagram below represents the SQ/CQ state after the 133 * first 6 packets have been sent by the host and 2 of them have been 134 * completed by the device (and these completions have been processed 135 * by the driver). In this state the host could send 4 more packets 136 * before needing to wait on completion events. 137 * 138 * 139 * +---+---+---+---+---+---+---+---+ 140 * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1 141 * +---+---+---+---+---+---+---+---+ 142 * ^ 143 * | 144 * tail 145 * head 146 * | 147 * v 148 * +---+---+---+---+---+---+---+---+ 149 * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1 150 * +---+---+---+---+---+---+---+---+ 151 * 152 * 153 * The next diagram shows how the state changes as 5 more packets are 154 * sent (for a total of 11) and 7 more are completed (for a total of 155 * 9). Notice that as the SQ and CQ have wrapped around, their phases 156 * have been complemented. In this state the host could send 6 more 157 * packets before needing to wait on completion events.
158 * 159 * +---+---+---+---+---+---+---+---+ 160 * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0 161 * +---+---+---+---+---+---+---+---+ 162 * ^ 163 * | 164 * tail 165 * head 166 * | 167 * v 168 * +---+---+---+---+---+---+---+---+ 169 * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0 170 * +---+---+---+---+---+---+---+---+ 171 * 172 * 173 * Currently, all packets are copied for Tx. At ring start we allocate 174 * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a 175 * DMA buffer associated with it; and each buffer is large enough to 176 * hold the MTU. Therefore, Tx descriptors and TCBs currently have a 177 * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to 178 * the TCB's DMA buffer, and a new descriptor is written to the SQ 179 * describing said TCB buffer. If and when we add more advanced 180 * features like DMA binding of mblks and TSO, this 1:1 guarantee will 181 * no longer hold. 182 * 183 * Rx Queue Workings 184 * ----------------- 185 * 186 * In terms of implementing the logical descriptor ring, the Rx queues 187 * are very much like the Tx queues. There is a paired SQ and CQ for 188 * each logical ring. The difference is that in Rx the SQ is for 189 * handing buffers to the device to fill, and the CQ is for describing 190 * the contents of those buffers for a given received frame. At Rx 191 * ring start we allocate an Rx Control Buffer (RCB) for each 192 * descriptor in the ring. Each RCB has a DMA buffer associated with 193 * it; and each buffer is large enough to hold the MTU. For each 194 * received frame we copy the contents out of the RCB and into its own 195 * mblk, immediately returning the RCB for reuse. As with Tx, this 196 * gives us a simple 1:1 mapping currently, but if more advanced 197 * features are implemented later this could change. 198 * 199 * Asynchronous Event Notification Queue (AENQ) 200 * -------------------------------------------- 201 * 202 * Each ENA device comes with a mechanism for sending out-of-band 203 * notifications to the driver. This includes events like link state 204 * changes, fatal errors, and a watchdog/keep alive signal. The AENQ 205 * delivery mechanism is via interrupt, handled by the ena_aenq_work() 206 * function, which dispatches via the eaenq_hdlrs table. If no handler 207 * is registered, the ena_aenq_default_hdlr() handler is used. A given 208 * device may not support all the different event types 209 * (enahw_aenq_groups_t); and the driver may choose to enable a subset 210 * of the supported events. During attach we call ena_setup_aenq() to 211 * negotiate the supported/enabled events. The enabled groups are 212 * stored at ena_aenq_enabled_groups. 213 * 214 * Queues and Unsigned Wraparound 215 * ------------------------------ 216 * 217 * All the queues use a uint16_t value as their head/tail values, e.g. 218 * the Rx queue's er_cq_head_idx value. You might notice that we only 219 * ever increment these values, letting them perform implicit unsigned 220 * integer wraparound. This is intended. This is the same behavior as 221 * the common code, and seems to be what the hardware expects. Of 222 * course, when accessing our own descriptor arrays we must make sure 223 * to first perform a modulo of this value or risk running off into 224 * space.
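 *
 * As a rough sketch of the pattern described above (the names here
 * are illustrative only, not actual driver symbols; ena_aenq_work()
 * in this file shows the real thing for the AENQ), a completion
 * queue is drained along these lines, assuming a power-of-two ring
 * size:
 *
 *	head = cq_head;			<- uint16_t, wraps freely
 *	mod = head & (num_descs - 1);
 *	cdesc = &cq_descs[mod];
 *
 *	while (DESC_PHASE(cdesc) == cq_phase) {
 *		<process the completion>
 *		head++;
 *		mod = head & (num_descs - 1);
 *		if (mod == 0)
 *			cq_phase ^= 1;	<- flip phase on wrap-around
 *		cdesc = &cq_descs[mod];
 *	}
 *	cq_head = head;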
225 * 226 * Attach Sequencing 227 * ----------------- 228 * 229 * Most drivers implement their attach/detach/cleanup functions as a 230 * sequential stream of function calls used to allocate and initialize 231 * resources in an order determined by the device's programming manual 232 * combined with any requirements imposed by the kernel and its 233 * relevant modules. These functions can become quite long. It is 234 * often hard to see the order in which steps are taken, and even 235 * harder to tell if detach/cleanup undoes them in the correct order, 236 * or even if it undoes them at all! The only sure way to understand 237 * the flow is to take good notes while closely inspecting each line 238 * of code. Even then, it's easy for attach and detach to get out of 239 * sync. 240 * 241 * Some more recent drivers have improved on this situation by using a 242 * bit vector to track the sequence of events in attach/detach. Each 243 * bit is declared as an enum value, in the same order that 244 * attach is expected to run, and thus detach would run in the exact 245 * opposite order. This has three main benefits: 246 * 247 * 1. It makes it easier to determine sequence order at a 248 * glance. 249 * 250 * 2. It gives a better idea of what state the device is in during 251 * debugging (the sequence bit vector is kept with the instance 252 * state). 253 * 254 * 3. The detach function can verify that all sequence bits are 255 * cleared, indicating that everything done in attach was 256 * successfully undone. 257 * 258 * These are great improvements. However, the attach/detach functions 259 * can still become unruly, and there is still no guarantee that 260 * detach is done in opposite order of attach (this is not always 261 * strictly required, but is probably the best way to write detach). 262 * There is still a lot of boilerplate and chance for programmer 263 * error. 264 * 265 * The ena driver takes the sequence idea a bit further, creating a 266 * descriptor table of the attach sequence (ena_attach_tbl). This 267 * table is used by attach/detach to generically, declaratively, and 268 * programmatically enforce the precise sequence order and verify that 269 * anything that is done is undone. This provides several benefits: 270 * 271 * o Correct order is enforced implicitly by the descriptor table. 272 * It is impossible for the detach sequence to run in any 273 * order other than the opposite of attach. 274 * 275 * o It is obvious what the precise attach sequence is. While the 276 * bit vector enum helps a lot with this it doesn't prevent 277 * programmer error. With the sequence defined as a declarative 278 * table, it is easy for the programmer to see the order and 279 * know it's followed exactly. 280 * 281 * o It is impossible to modify the attach sequence without also 282 * specifying a callback for its dual in the detach sequence. 283 * 284 * o Common and repetitive code like error checking, logging, and bit 285 * vector modification is eliminated and centralized, again 286 * reducing the chance of programmer error. 287 * 288 * The ena attach sequence is defined under ena_attach_seq_t. The 289 * descriptor table is defined under ena_attach_tbl. 290 */ 291 292 /* 293 * These are some basic data layout invariants on which development 294 * assumptions were made. 295 */ 296 CTASSERT(sizeof (enahw_aenq_desc_t) == 64); 297 /* TODO: Why doesn't this work?
*/ 298 /* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */ 299 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t)); 300 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t)); 301 CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t)); 302 /* 303 * We add this here as an extra safety check to make sure that any 304 * addition to the AENQ group enum also updates the groups array num 305 * value. 306 */ 307 CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6); 308 309 /* 310 * Amazon does not specify the endianness of the ENA device. We assume 311 * it's the same as the bus, and we assume the CPU/bus is always 312 * little endian. 313 */ 314 #ifdef _BIG_ENDIAN 315 #error "ENA driver is little-endian only" 316 #endif 317 318 /* 319 * These values are used to communicate the driver version to the AWS 320 * hypervisor via the ena_set_host_info() function. We don't know what 321 * exactly AWS does with this info, but it's fairly safe to assume 322 * it's used solely for debug/informational purposes. The Linux driver 323 * updates these values frequently as bugs are fixed and features are 324 * added. 325 */ 326 #define ENA_DRV_VER_MAJOR 1 327 #define ENA_DRV_VER_MINOR 0 328 #define ENA_DRV_VER_SUBMINOR 0 329 330 uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; 331 332 /* 333 * Log an error message. We leave the destination (console or system 334 * log) up to the caller. 335 */ 336 void 337 ena_err(const ena_t *ena, const char *fmt, ...) 338 { 339 va_list ap; 340 341 va_start(ap, fmt); 342 if (ena != NULL && ena->ena_dip != NULL) { 343 vdev_err(ena->ena_dip, CE_WARN, fmt, ap); 344 } else { 345 vcmn_err(CE_WARN, fmt, ap); 346 } 347 va_end(ap); 348 } 349 350 /* 351 * Set this to B_TRUE to enable debug messages. 352 */ 353 boolean_t ena_debug = B_FALSE; 354 355 /* 356 * Log a debug message. We force all debug messages to go to the 357 * system log. 358 */ 359 void 360 ena_dbg(const ena_t *ena, const char *fmt, ...)
361 { 362 va_list ap; 363 364 if (ena_debug) { 365 char msg[1024]; 366 367 va_start(ap, fmt); 368 (void) vsnprintf(msg, sizeof (msg), fmt, ap); 369 va_end(ap); 370 371 if (ena != NULL && ena->ena_dip != NULL) { 372 dev_err(ena->ena_dip, CE_NOTE, "!%s", msg); 373 } else { 374 cmn_err(CE_NOTE, "!%s", msg); 375 } 376 } 377 } 378 379 ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = { 380 { .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" }, 381 { .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" }, 382 { .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" }, 383 { 384 .eag_type = ENAHW_AENQ_GROUP_NOTIFICATION, 385 .eag_str = "NOTIFICATION" 386 }, 387 { .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" }, 388 { 389 .eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES, 390 .eag_str = "REFRESH CAPABILITIES" 391 }, 392 }; 393 394 void 395 ena_aenq_work(ena_t *ena) 396 { 397 ena_aenq_t *aenq = &ena->ena_aenq; 398 uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); 399 boolean_t processed = B_FALSE; 400 enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod]; 401 402 ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL); 403 404 while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) { 405 ena_aenq_hdlr_t hdlr; 406 uint64_t ts; 407 408 ts = ((uint64_t)desc->ead_ts_high << 32) | (uint64_t)desc->ead_ts_low; 409 ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM); 410 processed = B_TRUE; 411 ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64 412 " us", desc->ead_group, 413 ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome, 414 ts); 415 416 hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group]; 417 hdlr(ena, desc); 418 419 aenq->eaenq_head++; 420 head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); 421 422 if (head_mod == 0) { 423 aenq->eaenq_phase ^= 1; 424 } 425 426 desc = &aenq->eaenq_descs[head_mod]; 427 } 428 429 if (processed) { 430 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB, 431 aenq->eaenq_head); 432 } 433 } 434 435 /* 436 * Use for attach sequences which perform no resource allocation (or 437 * global state modification) and thus require no subsequent 438 * deallocation.
439 */ 440 static void 441 ena_no_cleanup(ena_t *ena) 442 { 443 } 444 445 static boolean_t 446 ena_attach_pci(ena_t *ena) 447 { 448 ddi_acc_handle_t hdl; 449 450 if (pci_config_setup(ena->ena_dip, &hdl) != 0) { 451 return (B_FALSE); 452 } 453 454 ena->ena_pci_hdl = hdl; 455 ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID); 456 ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID); 457 ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID); 458 ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID); 459 ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID); 460 ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x", 461 ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev, 462 ena->ena_pci_svid, ena->ena_pci_sdid); 463 464 return (B_TRUE); 465 } 466 467 static void 468 ena_cleanup_pci(ena_t *ena) 469 { 470 pci_config_teardown(&ena->ena_pci_hdl); 471 } 472 473 static void 474 ena_cleanup_regs_map(ena_t *ena) 475 { 476 ddi_regs_map_free(&ena->ena_reg_hdl); 477 } 478 479 static boolean_t 480 ena_attach_regs_map(ena_t *ena) 481 { 482 int ret = 0; 483 484 if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) != 485 DDI_SUCCESS) { 486 ena_err(ena, "failed to get register set %d size", 487 ENA_REG_NUMBER); 488 return (B_FALSE); 489 } 490 491 ena_dbg(ena, "register size: %ld", ena->ena_reg_size); 492 bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr)); 493 ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1; 494 ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC; 495 ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 496 497 /* 498 * This function can return several different failure values, 499 * so we make sure to capture its return value for the purpose 500 * of logging. 501 */ 502 ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER, 503 &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr, 504 &ena->ena_reg_hdl); 505 506 if (ret != DDI_SUCCESS) { 507 ena_err(ena, "failed to map register set %d: %d", 508 ENA_REG_NUMBER, ret); 509 return (B_FALSE); 510 } 511 512 ena_dbg(ena, "registers mapped to base: 0x%p", 513 (void *)ena->ena_reg_base); 514 515 return (B_TRUE); 516 } 517 518 /* 519 * Free any resources related to the admin submission queue. 520 */ 521 static void 522 ena_admin_sq_free(ena_t *ena) 523 { 524 ena_dma_free(&ena->ena_aq.ea_sq.eas_dma); 525 } 526 527 /* 528 * Initialize the admin submission queue. 
529 */ 530 static boolean_t 531 ena_admin_sq_init(ena_t *ena) 532 { 533 ena_adminq_t *aq = &ena->ena_aq; 534 ena_dma_buf_t *dma = &aq->ea_sq.eas_dma; 535 size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries); 536 uint32_t addr_low, addr_high, wval; 537 ena_dma_conf_t conf = { 538 .edc_size = size, 539 .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT, 540 .edc_sgl = 1, 541 .edc_endian = DDI_NEVERSWAP_ACC, 542 .edc_stream = B_FALSE, 543 }; 544 545 if (!ena_dma_alloc(ena, dma, &conf, size)) { 546 ena_err(ena, "failed to allocate DMA for Admin SQ"); 547 return (B_FALSE); 548 } 549 550 aq->ea_sq.eas_entries = (void *)dma->edb_va; 551 aq->ea_sq.eas_tail = 0; 552 aq->ea_sq.eas_phase = 1; 553 aq->ea_sq.eas_dbaddr = 554 (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB); 555 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); 556 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); 557 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); 558 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low); 559 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high); 560 wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) | 561 ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries)); 562 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval); 563 return (B_TRUE); 564 } 565 566 /* 567 * Free any resources related to the admin completion queue. 568 */ 569 static void 570 ena_admin_cq_free(ena_t *ena) 571 { 572 ena_dma_free(&ena->ena_aq.ea_cq.eac_dma); 573 } 574 575 /* 576 * Initialize the admin completion queue. 577 */ 578 static boolean_t 579 ena_admin_cq_init(ena_t *ena) 580 { 581 ena_adminq_t *aq = &ena->ena_aq; 582 ena_dma_buf_t *dma = &aq->ea_cq.eac_dma; 583 size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries); 584 uint32_t addr_low, addr_high, wval; 585 ena_dma_conf_t conf = { 586 .edc_size = size, 587 .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT, 588 .edc_sgl = 1, 589 .edc_endian = DDI_NEVERSWAP_ACC, 590 .edc_stream = B_FALSE, 591 }; 592 593 if (!ena_dma_alloc(ena, dma, &conf, size)) { 594 ena_err(ena, "failed to allocate DMA for Admin CQ"); 595 return (B_FALSE); 596 } 597 598 aq->ea_cq.eac_entries = (void *)dma->edb_va; 599 aq->ea_cq.eac_head = 0; 600 aq->ea_cq.eac_phase = 1; 601 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); 602 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); 603 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); 604 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low); 605 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high); 606 wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) | 607 ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries)); 608 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval); 609 return (B_TRUE); 610 } 611 612 static void 613 ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc) 614 { 615 ena_t *ena = data; 616 617 ena->ena_aenq_stat.eaes_default.value.ui64++; 618 ena_dbg(ena, "unimplemented handler for aenq group: %s", 619 ena_groups_str[desc->ead_group].eag_str); 620 } 621 622 static void 623 ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc) 624 { 625 ena_t *ena = data; 626 boolean_t is_up = (desc->ead_payload.link_change.flags & 627 ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0; 628 629 /* 630 * The interrupts are not enabled until after we register mac, 631 * so the mac handle should be valid. 
632 */ 633 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER); 634 ena->ena_aenq_stat.eaes_link_change.value.ui64++; 635 636 mutex_enter(&ena->ena_lock); 637 638 /* 639 * Notify mac only on an actual change in status. 640 */ 641 if (ena->ena_link_up != is_up) { 642 if (is_up) { 643 mac_link_update(ena->ena_mh, LINK_STATE_UP); 644 } else { 645 mac_link_update(ena->ena_mh, LINK_STATE_DOWN); 646 } 647 } 648 649 ena->ena_link_up = is_up; 650 651 mutex_exit(&ena->ena_lock); 652 } 653 654 /* 655 * Free any resources related to the Async Event Notification Queue. 656 */ 657 static void 658 ena_aenq_free(ena_t *ena) 659 { 660 ena_dma_free(&ena->ena_aenq.eaenq_dma); 661 } 662 663 static void 664 ena_aenq_set_def_hdlrs(ena_aenq_t *aenq) 665 { 666 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr; 667 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr; 668 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr; 669 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] = 670 ena_aenq_default_hdlr; 671 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr; 672 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] = 673 ena_aenq_default_hdlr; 674 } 675 /* 676 * Initialize the Async Event Notification Queue. 677 */ 678 static boolean_t 679 ena_aenq_init(ena_t *ena) 680 { 681 ena_aenq_t *aenq = &ena->ena_aenq; 682 size_t size; 683 uint32_t addr_low, addr_high, wval; 684 ena_dma_conf_t conf; 685 686 aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS; 687 size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs); 688 689 /* BEGIN CSTYLED */ 690 conf = (ena_dma_conf_t) { 691 .edc_size = size, 692 .edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT, 693 .edc_sgl = 1, 694 .edc_endian = DDI_NEVERSWAP_ACC, 695 .edc_stream = B_FALSE, 696 }; 697 /* END CSTYLED */ 698 699 if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) { 700 ena_err(ena, "failed to allocate DMA for AENQ"); 701 return (B_FALSE); 702 } 703 704 aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va; 705 aenq->eaenq_head = 0; 706 aenq->eaenq_phase = 1; 707 bzero(aenq->eaenq_descs, size); 708 ena_aenq_set_def_hdlrs(aenq); 709 710 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = 711 ena_aenq_link_change_hdlr; 712 713 ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress); 714 addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress); 715 addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32); 716 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low); 717 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high); 718 ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV); 719 wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) | 720 ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs)); 721 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval); 722 return (B_TRUE); 723 } 724 725 /* 726 * We limit the max number of I/O queues based on several aspects of 727 * the underlying hardware. 728 * 729 * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES, 730 * which comes from the common code and presumably is based on device 731 * constraints. 732 * 733 * 2. Next we latch the number of I/O queues to the number of online 734 * CPUs. The idea being that each queue is a parallel work stream, 735 * and having more queues than CPUs to flush them will not improve 736 * performance. The number of online CPUs can change dynamically, 737 * and that's okay, everything should still work fine, it just 738 * might not be ideal. 739 * 740 * 3. 
Next we latch the number of I/O queues to the smallest of the 741 * max Tx queues and max Rx queues. We could probably loosen this 742 * restriction in the future, and have separate max I/O queues for 743 * Tx and Rx. This is what Linux does, and seems like a fine place 744 * to start. 745 */ 746 static void 747 ena_set_max_io_queues(ena_t *ena) 748 { 749 uint32_t max = ENAHW_MAX_NUM_IO_QUEUES; 750 751 max = MIN(ncpus_online, max); 752 /* 753 * Supposedly a device could present a different number of SQs 754 * and CQs. This driver is designed in a way that requires 755 * each SQ to have a corresponding and dedicated CQ (how would 756 * it work otherwise). Therefore, we must check both values 757 * and find the minimum between them. 758 */ 759 max = MIN(ena->ena_tx_max_sq_num, max); 760 max = MIN(ena->ena_tx_max_cq_num, max); 761 max = MIN(ena->ena_rx_max_sq_num, max); 762 max = MIN(ena->ena_rx_max_cq_num, max); 763 764 765 /* This shouldn't happen, but just in case. */ 766 if (max == 0) { 767 max = 1; 768 } 769 770 ena->ena_max_io_queues = max; 771 } 772 773 /* 774 * We require that an Rx or Tx buffer be able to hold the maximum MTU 775 * along with the maximum frame header length. In this case we know 776 * ENA is presenting us an Ethernet frame so we add the size of an 777 * Ethernet VLAN header. Rx has the additional requirement of needing 778 * additional margin for the sake of IP header alignment. 779 */ 780 static void 781 ena_update_buf_sizes(ena_t *ena) 782 { 783 ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header); 784 ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu; 785 ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total, 786 ena->ena_page_sz, uint32_t); 787 ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total + 788 ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t); 789 } 790 791 static boolean_t 792 ena_get_offloads(ena_t *ena) 793 { 794 int ret = 0; 795 enahw_resp_desc_t resp; 796 enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload; 797 798 ena->ena_tx_l3_ipv4_csum = B_FALSE; 799 800 ena->ena_tx_l4_ipv4_part_csum = B_FALSE; 801 ena->ena_tx_l4_ipv4_full_csum = B_FALSE; 802 ena->ena_tx_l4_ipv4_lso = B_FALSE; 803 804 ena->ena_tx_l4_ipv6_part_csum = B_FALSE; 805 ena->ena_tx_l4_ipv6_full_csum = B_FALSE; 806 ena->ena_tx_l4_ipv6_lso = B_FALSE; 807 808 ena->ena_rx_l3_ipv4_csum = B_FALSE; 809 ena->ena_rx_l4_ipv4_csum = B_FALSE; 810 ena->ena_rx_l4_ipv6_csum = B_FALSE; 811 ena->ena_rx_hash = B_FALSE; 812 813 bzero(&resp, sizeof (resp)); 814 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG, 815 ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER); 816 817 if (ret == ENOTSUP) { 818 /* 819 * In this case the device does not support querying 820 * for hardware offloads. We take that as a sign that 821 * the device provides no offloads. 
822 */ 823 return (B_TRUE); 824 } else if (ret != 0) { 825 ena_err(ena, "error getting stateless offload: %d", ret); 826 return (B_FALSE); 827 } 828 829 ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat); 830 831 ena->ena_tx_l4_ipv4_part_csum = 832 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat); 833 ena->ena_tx_l4_ipv4_full_csum = 834 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat); 835 ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat); 836 837 ena->ena_tx_l4_ipv6_part_csum = 838 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat); 839 ena->ena_tx_l4_ipv6_full_csum = 840 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat); 841 ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat); 842 843 ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat); 844 ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat); 845 ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat); 846 return (B_TRUE); 847 } 848 849 static int 850 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval, 851 const int defval) 852 { 853 int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip, 854 DDI_PROP_DONTPASS, propname, defval); 855 856 if (value > maxval) { 857 ena_err(ena, "user value %s=%d exceeded maximum, setting to %d", 858 propname, value, maxval); 859 value = maxval; 860 } 861 862 if (value < minval) { 863 ena_err(ena, "user value %s=%d below minimum, setting to %d", 864 propname, value, minval); 865 value = minval; 866 } 867 868 return (value); 869 } 870 871 static boolean_t 872 ena_set_mtu(ena_t *ena) 873 { 874 int ret = 0; 875 enahw_cmd_desc_t cmd; 876 enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu; 877 enahw_resp_desc_t resp; 878 879 bzero(&cmd, sizeof (cmd)); 880 bzero(&resp, sizeof (resp)); 881 feat->efm_mtu = ena->ena_mtu; 882 883 if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU, 884 ENAHW_FEAT_MTU_VER)) != 0) { 885 ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu, 886 ret); 887 return (B_FALSE); 888 } 889 890 return (B_TRUE); 891 } 892 893 static void 894 ena_get_link_config(ena_t *ena) 895 { 896 enahw_resp_desc_t resp; 897 enahw_feat_link_conf_t *feat = 898 &resp.erd_resp.erd_get_feat.ergf_link_conf; 899 boolean_t full_duplex; 900 901 bzero(&resp, sizeof (resp)); 902 903 if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG, 904 ENAHW_FEAT_LINK_CONFIG_VER) != 0) { 905 /* 906 * Some ENA devices do no support this feature. In 907 * those cases we report a 1Gbps link, full duplex. 908 * For the most accurate information on bandwidth 909 * limits see the official AWS documentation. 910 */ 911 ena->ena_link_speed_mbits = 1 * 1000 * 1000; 912 ena->ena_link_speeds = ENAHW_LINK_SPEED_1G; 913 ena->ena_link_duplex = LINK_DUPLEX_FULL; 914 ena->ena_link_autoneg = B_TRUE; 915 return; 916 } 917 918 ena->ena_link_speed_mbits = feat->eflc_speed; 919 ena->ena_link_speeds = feat->eflc_supported; 920 full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat); 921 ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL : 922 LINK_DUPLEX_HALF; 923 ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat); 924 } 925 926 /* 927 * Retrieve all configuration values which are modifiable via 928 * ena.conf, and set ena_t members accordingly. While the conf values 929 * have priority, they may be implicitly modified by the driver to 930 * meet resource constraints on a given platform. If no value is 931 * specified in the conf file, the driver will attempt to use the 932 * largest value supported. 
While there should be no value large 933 * enough, keep in mind that ena_get_prop() will cast the values to an 934 * int. 935 * 936 * This function should be called after the device is initialized, 937 * admin queue is established, and the hardware features/capabs have 938 * been queried; it should be called before mac registration. 939 */ 940 static boolean_t 941 ena_attach_read_conf(ena_t *ena) 942 { 943 uint32_t gcv; /* Greatest Common Value */ 944 945 /* 946 * We expect that the queue lengths are the same for both the 947 * CQ and SQ, but technically the device could return 948 * different lengths. For now the driver locks them together. 949 */ 950 gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs); 951 ASSERT3U(gcv, <=, INT_MAX); 952 ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS, 953 ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv); 954 955 ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT, 956 ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX, 957 ENA_PROP_RXQ_INTR_LIMIT_DEF); 958 959 gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs); 960 ASSERT3U(gcv, <=, INT_MAX); 961 ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS, 962 ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv); 963 964 return (B_TRUE); 965 } 966 967 /* 968 * Perform any necessary device configuration after the driver.conf 969 * has been read. 970 */ 971 static boolean_t 972 ena_attach_dev_cfg(ena_t *ena) 973 { 974 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF); 975 976 if (!ena_set_mtu(ena)) { 977 /* 978 * We don't expect this to fail, but we try a fallback 979 * first before failing the attach sequence. 980 */ 981 ena->ena_mtu = 1500; 982 ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu); 983 984 if (!ena_set_mtu(ena)) { 985 return (B_FALSE); 986 } 987 } 988 989 return (B_TRUE); 990 } 991 992 static boolean_t 993 ena_check_versions(ena_t *ena) 994 { 995 uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION); 996 uint32_t ctrl_vsn = 997 ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION); 998 999 ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn); 1000 ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn); 1001 1002 ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn); 1003 ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn); 1004 ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn); 1005 ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn); 1006 1007 ena_dbg(ena, "device version: %u.%u", 1008 ena->ena_dev_major_vsn, ena->ena_dev_minor_vsn); 1009 ena_dbg(ena, "controller version: %u.%u.%u implementation %u", 1010 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn, 1011 ena->ena_ctrl_subminor_vsn, ena->ena_ctrl_impl_id); 1012 1013 if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) { 1014 ena_err(ena, "unsupported controller version: %u.%u.%u", 1015 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn, 1016 ena->ena_ctrl_subminor_vsn); 1017 return (B_FALSE); 1018 } 1019 1020 return (B_TRUE); 1021 } 1022 1023 boolean_t 1024 ena_setup_aenq(ena_t *ena) 1025 { 1026 enahw_cmd_desc_t cmd; 1027 enahw_feat_aenq_t *cmd_feat = 1028 &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq; 1029 enahw_resp_desc_t resp; 1030 enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq; 1031 enahw_aenq_groups_t to_enable; 1032 1033 bzero(&resp, sizeof (resp)); 1034 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, 1035 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1036 return (B_FALSE); 1037 } 1038 1039 to_enable = 
BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) | 1040 BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) | 1041 BIT(ENAHW_AENQ_GROUP_WARNING) | 1042 BIT(ENAHW_AENQ_GROUP_NOTIFICATION); 1043 to_enable &= resp_feat->efa_supported_groups; 1044 1045 bzero(&cmd, sizeof (cmd)); 1046 bzero(&resp, sizeof (cmd)); 1047 cmd_feat->efa_enabled_groups = to_enable; 1048 1049 if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG, 1050 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1051 return (B_FALSE); 1052 } 1053 1054 bzero(&resp, sizeof (resp)); 1055 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, 1056 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1057 return (B_FALSE); 1058 } 1059 1060 ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups; 1061 ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups; 1062 1063 for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) { 1064 ena_aenq_grpstr_t *grpstr = &ena_groups_str[i]; 1065 boolean_t supported = BIT(grpstr->eag_type) & 1066 resp_feat->efa_supported_groups; 1067 boolean_t enabled = BIT(grpstr->eag_type) & 1068 resp_feat->efa_enabled_groups; 1069 1070 ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str, 1071 supported ? "Y" : "N", enabled ? "Y" : "N"); 1072 } 1073 1074 return (B_TRUE); 1075 } 1076 1077 /* 1078 * Free all resources allocated as part of ena_device_init(). 1079 */ 1080 static void 1081 ena_cleanup_device_init(ena_t *ena) 1082 { 1083 ena_adminq_t *aq = &ena->ena_aq; 1084 1085 ena_free_host_info(ena); 1086 mutex_destroy(&aq->ea_sq_lock); 1087 mutex_destroy(&aq->ea_cq_lock); 1088 mutex_destroy(&aq->ea_stat_lock); 1089 list_destroy(&aq->ea_cmd_ctxs_free); 1090 kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen); 1091 ena_admin_sq_free(ena); 1092 ena_admin_cq_free(ena); 1093 ena_aenq_free(ena); 1094 ena_stat_device_basic_cleanup(ena); 1095 ena_stat_device_extended_cleanup(ena); 1096 ena_stat_aenq_cleanup(ena); 1097 } 1098 1099 static boolean_t 1100 ena_attach_device_init(ena_t *ena) 1101 { 1102 ena_adminq_t *aq = &ena->ena_aq; 1103 uint32_t rval, wval; 1104 uint8_t dma_width; 1105 hrtime_t timeout, cmd_timeout; 1106 hrtime_t expired; 1107 enahw_resp_desc_t resp; 1108 enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr; 1109 uint8_t *maddr; 1110 uint32_t supported_features; 1111 int ret = 0; 1112 1113 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1114 if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) { 1115 ena_err(ena, "device is not ready"); 1116 return (B_FALSE); 1117 } 1118 1119 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); 1120 1121 /* 1122 * The device stores the reset timeout at 100ms resolution; we 1123 * normalize that to nanoseconds. 1124 */ 1125 timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100); 1126 1127 if (timeout == 0) { 1128 ena_err(ena, "device gave invalid reset timeout"); 1129 return (B_FALSE); 1130 } 1131 1132 expired = gethrtime() + timeout; 1133 1134 wval = ENAHW_DEV_CTL_DEV_RESET_MASK; 1135 wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) & 1136 ENAHW_DEV_CTL_RESET_REASON_MASK; 1137 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval); 1138 1139 /* 1140 * Make sure reset is in progress. 1141 */ 1142 while (1) { 1143 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1144 1145 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) { 1146 break; 1147 } 1148 1149 if (gethrtime() > expired) { 1150 ena_err(ena, "device reset start timed out"); 1151 return (B_FALSE); 1152 } 1153 1154 /* Sleep for 100 milliseconds. 
*/ 1155 delay(drv_usectohz(100 * 1000)); 1156 } 1157 1158 /* 1159 * Reset the timeout counter for the next device request. 1160 */ 1161 expired = gethrtime() + timeout; 1162 1163 /* 1164 * Wait for the device reset to finish. 1165 */ 1166 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0); 1167 while (1) { 1168 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1169 1170 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) { 1171 break; 1172 } 1173 1174 if (gethrtime() > expired) { 1175 ena_err(ena, "device reset timed out"); 1176 return (B_FALSE); 1177 } 1178 1179 /* Sleep for 100 milliseconds. */ 1180 delay(drv_usectohz(100 * 1000)); 1181 } 1182 1183 if (!ena_check_versions(ena)) { 1184 return (B_FALSE); 1185 } 1186 1187 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); 1188 dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval); 1189 ena->ena_dma_width = dma_width; 1190 1191 /* 1192 * As we are not using an interrupt for admin queue completion 1193 * signaling, we do not need a priority on these mutexes. If 1194 * that changes, we will have to rejigger some code to create 1195 * the admin queue interrupt before this function. 1196 */ 1197 mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL); 1198 mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL); 1199 mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL); 1200 aq->ea_qlen = ENA_ADMINQ_DEPTH; 1201 aq->ea_pending_cmds = 0; 1202 1203 aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen, 1204 KM_SLEEP); 1205 list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t), 1206 offsetof(ena_cmd_ctx_t, ectx_node)); 1207 1208 for (uint_t i = 0; i < aq->ea_qlen; i++) { 1209 ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i]; 1210 1211 ctx->ectx_id = i; 1212 ctx->ectx_pending = B_FALSE; 1213 ctx->ectx_cmd_opcode = ENAHW_CMD_NONE; 1214 ctx->ectx_resp = NULL; 1215 list_insert_tail(&aq->ea_cmd_ctxs_free, ctx); 1216 } 1217 1218 /* 1219 * The value stored in the device register is in the 1220 * resolution of 100 milliseconds. We normalize that to 1221 * nanoseconds. 1222 */ 1223 cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100); 1224 aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns); 1225 1226 if (aq->ea_cmd_timeout_ns == 0) { 1227 aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; 1228 } 1229 1230 if (!ena_admin_sq_init(ena)) { 1231 return (B_FALSE); 1232 } 1233 1234 if (!ena_admin_cq_init(ena)) { 1235 return (B_FALSE); 1236 } 1237 1238 if (!ena_aenq_init(ena)) { 1239 return (B_FALSE); 1240 } 1241 1242 /* 1243 * Start in polling mode until we've determined the number of queues 1244 * and are ready to configure and enable interrupts. 
1245 */ 1246 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_MASK); 1247 aq->ea_poll_mode = B_TRUE; 1248 1249 bzero(&resp, sizeof (resp)); 1250 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES, 1251 ENAHW_FEAT_DEVICE_ATTRIBUTES_VER); 1252 1253 if (ret != 0) { 1254 ena_err(ena, "failed to get device attributes: %d", ret); 1255 return (B_FALSE); 1256 } 1257 1258 ena_dbg(ena, "impl ID: %u", feat->efda_impl_id); 1259 ena_dbg(ena, "device version: %u", feat->efda_device_version); 1260 ena_dbg(ena, "supported features: 0x%x", 1261 feat->efda_supported_features); 1262 ena_dbg(ena, "device capabilities: 0x%x", feat->efda_capabilities); 1263 ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width); 1264 ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with); 1265 maddr = feat->efda_mac_addr; 1266 ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1], 1267 maddr[2], maddr[3], maddr[4], maddr[5]); 1268 ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu); 1269 1270 bcopy(maddr, ena->ena_mac_addr, ETHERADDRL); 1271 ena->ena_max_mtu = feat->efda_max_mtu; 1272 ena->ena_capabilities = feat->efda_capabilities; 1273 supported_features = feat->efda_supported_features; 1274 ena->ena_supported_features = supported_features; 1275 feat = NULL; 1276 bzero(&resp, sizeof (resp)); 1277 1278 if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) { 1279 enahw_feat_max_queue_ext_t *feat_mqe = 1280 &resp.erd_resp.erd_get_feat.ergf_max_queue_ext; 1281 1282 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT, 1283 ENAHW_FEAT_MAX_QUEUES_EXT_VER); 1284 1285 if (ret != 0) { 1286 ena_err(ena, "failed to query max queues ext: %d", ret); 1287 return (B_FALSE); 1288 } 1289 1290 ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num; 1291 ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth; 1292 ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num; 1293 ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth; 1294 ena->ena_tx_max_desc_per_pkt = 1295 feat_mqe->efmqe_max_per_packet_tx_descs; 1296 ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size; 1297 1298 ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num; 1299 ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth; 1300 ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num; 1301 ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth; 1302 ena->ena_rx_max_desc_per_pkt = 1303 feat_mqe->efmqe_max_per_packet_rx_descs; 1304 1305 ena_set_max_io_queues(ena); 1306 } else { 1307 enahw_feat_max_queue_t *feat_mq = 1308 &resp.erd_resp.erd_get_feat.ergf_max_queue; 1309 1310 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM, 1311 ENAHW_FEAT_MAX_QUEUES_NUM_VER); 1312 1313 if (ret != 0) { 1314 ena_err(ena, "failed to query max queues: %d", ret); 1315 return (B_FALSE); 1316 } 1317 1318 ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num; 1319 ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; 1320 ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num; 1321 ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; 1322 ena->ena_tx_max_desc_per_pkt = 1323 feat_mq->efmq_max_per_packet_tx_descs; 1324 ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size; 1325 1326 ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num; 1327 ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; 1328 ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num; 1329 ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; 1330 ena->ena_rx_max_desc_per_pkt = 1331 feat_mq->efmq_max_per_packet_rx_descs; 1332 
1333 ena_set_max_io_queues(ena); 1334 } 1335 1336 ena->ena_mtu = ena->ena_max_mtu; 1337 ena_update_buf_sizes(ena); 1338 /* 1339 * We could use ENAHW_FEAT_HW_HINTS to determine actual SGL 1340 * sizes, for now we just force everything to use one 1341 * segment. 1342 */ 1343 ena->ena_tx_sgl_max_sz = 1; 1344 ena->ena_rx_sgl_max_sz = 1; 1345 1346 if (!ena_init_host_info(ena)) { 1347 return (B_FALSE); 1348 } 1349 1350 if (!ena_setup_aenq(ena)) { 1351 return (B_FALSE); 1352 } 1353 1354 ena_get_link_config(ena); 1355 1356 if (!ena_get_offloads(ena)) { 1357 return (B_FALSE); 1358 } 1359 1360 if (!ena_stat_device_basic_init(ena)) { 1361 return (B_FALSE); 1362 } 1363 1364 if (!ena_stat_device_extended_init(ena)) { 1365 return (B_FALSE); 1366 } 1367 1368 if (!ena_stat_aenq_init(ena)) { 1369 return (B_FALSE); 1370 } 1371 1372 return (B_TRUE); 1373 } 1374 1375 static void 1376 ena_cleanup_intr_alloc(ena_t *ena) 1377 { 1378 for (int i = 0; i < ena->ena_num_intrs; i++) { 1379 int ret = ddi_intr_free(ena->ena_intr_handles[i]); 1380 if (ret != DDI_SUCCESS) { 1381 ena_err(ena, "failed to free interrupt %d: %d", i, ret); 1382 } 1383 } 1384 1385 if (ena->ena_intr_handles != NULL) { 1386 kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz); 1387 ena->ena_intr_handles = NULL; 1388 ena->ena_intr_handles_sz = 0; 1389 } 1390 } 1391 1392 /* 1393 * The Linux driver supports only MSI-X interrupts. We do the same, 1394 * with the assumption that it's the only type of interrupt the device 1395 * can present. 1396 */ 1397 static boolean_t 1398 ena_attach_intr_alloc(ena_t *ena) 1399 { 1400 int ret; 1401 int types; 1402 int min, req, ideal, avail, actual; 1403 1404 ret = ddi_intr_get_supported_types(ena->ena_dip, &types); 1405 if (ret != DDI_SUCCESS) { 1406 ena_err(ena, "failed to get interrupt types: %d", ret); 1407 return (B_FALSE); 1408 } 1409 1410 ena_dbg(ena, "supported interrupt types: 0x%x", types); 1411 if ((types & DDI_INTR_TYPE_MSIX) == 0) { 1412 ena_err(ena, "the ena driver only supports MSI-X interrupts"); 1413 return (B_FALSE); 1414 } 1415 1416 /* One for I/O, one for adminq. 
*/ 1417 min = 2; 1418 ideal = ena->ena_max_io_queues + 1; 1419 ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); 1420 if (ret != DDI_SUCCESS) { 1421 ena_err(ena, "failed to get number of MSI-X interrupts: %d", 1422 ret); 1423 return (B_FALSE); 1424 } 1425 1426 if (avail < min) { 1427 ena_err(ena, "number of MSI-X interrupts is %d, but the driver " 1428 "requires a minimum of %d", avail, min); 1429 return (B_FALSE); 1430 } 1431 1432 ena_dbg(ena, "%d MSI-X interrupts available", avail); 1433 1434 ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); 1435 if (ret != DDI_SUCCESS) { 1436 ena_err(ena, "failed to get available interrupts: %d", ret); 1437 return (B_FALSE); 1438 } 1439 1440 if (avail < min) { 1441 ena_err(ena, "number of available MSI-X interrupts is %d, " 1442 "but the driver requires a minimum of %d", avail, min); 1443 return (B_FALSE); 1444 } 1445 1446 req = MIN(ideal, avail); 1447 ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t); 1448 ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP); 1449 1450 ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles, 1451 DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL); 1452 if (ret != DDI_SUCCESS) { 1453 ena_err(ena, "failed to allocate %d MSI-X interrupts: %d", 1454 req, ret); 1455 return (B_FALSE); 1456 } 1457 1458 if (actual < min) { 1459 ena_err(ena, "number of allocated interrupts is %d, but the " 1460 "driver requires a minimum of %d", actual, min); 1461 return (B_FALSE); 1462 } 1463 1464 ena->ena_num_intrs = actual; 1465 1466 ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps); 1467 if (ret != DDI_SUCCESS) { 1468 ena_err(ena, "failed to get interrupt capability: %d", ret); 1469 return (B_FALSE); 1470 } 1471 1472 ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri); 1473 if (ret != DDI_SUCCESS) { 1474 ena_err(ena, "failed to get interrupt priority: %d", ret); 1475 return (B_FALSE); 1476 } 1477 1478 ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u", 1479 actual, ena->ena_intr_caps, ena->ena_intr_pri); 1480 1481 /* 1482 * The ena_lock should not be held in the datapath, but it is 1483 * held as part of the AENQ handler, which runs in interrupt 1484 * context. Therefore, we delayed the initialization of this 1485 * mutex until after the interrupts are allocated. 1486 */ 1487 mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER, 1488 DDI_INTR_PRI(ena->ena_intr_pri)); 1489 1490 return (B_TRUE); 1491 } 1492 1493 /* 1494 * Allocate the parent Rx queue structures. More importantly, this is 1495 * NOT allocating the queue descriptors or data buffers. Those are 1496 * allocated on demand as queues are started. 1497 */ 1498 static boolean_t 1499 ena_attach_alloc_rxqs(ena_t *ena) 1500 { 1501 /* We rely on the interrupt priority for initializing the mutexes. */ 1502 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); 1503 ena->ena_num_rxqs = ena->ena_num_intrs - 1; 1504 ASSERT3U(ena->ena_num_rxqs, >, 0); 1505 ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), 1506 KM_SLEEP); 1507 1508 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { 1509 ena_rxq_t *rxq = &ena->ena_rxqs[i]; 1510 1511 rxq->er_rxqs_idx = i; 1512 /* The 0th vector is for Admin + AENQ. 
*/ 1513 rxq->er_intr_vector = i + 1; 1514 rxq->er_mrh = NULL; 1515 1516 mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER, 1517 DDI_INTR_PRI(ena->ena_intr_pri)); 1518 mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER, 1519 DDI_INTR_PRI(ena->ena_intr_pri)); 1520 1521 rxq->er_ena = ena; 1522 rxq->er_sq_num_descs = ena->ena_rxq_num_descs; 1523 rxq->er_cq_num_descs = ena->ena_rxq_num_descs; 1524 1525 if (!ena_stat_rxq_init(rxq)) { 1526 return (B_FALSE); 1527 } 1528 1529 if (!ena_alloc_rxq(rxq)) { 1530 return (B_FALSE); 1531 } 1532 } 1533 1534 return (B_TRUE); 1535 } 1536 1537 static void 1538 ena_cleanup_rxqs(ena_t *ena) 1539 { 1540 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { 1541 ena_rxq_t *rxq = &ena->ena_rxqs[i]; 1542 1543 ena_cleanup_rxq(rxq); 1544 mutex_destroy(&rxq->er_lock); 1545 mutex_destroy(&rxq->er_stat_lock); 1546 ena_stat_rxq_cleanup(rxq); 1547 } 1548 1549 kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs)); 1550 } 1551 1552 /* 1553 * Allocate the parent Tx queue structures. More importantly, this is 1554 * NOT allocating the queue descriptors or data buffers. Those are 1555 * allocated on demand as a queue is started. 1556 */ 1557 static boolean_t 1558 ena_attach_alloc_txqs(ena_t *ena) 1559 { 1560 /* We rely on the interrupt priority for initializing the mutexes. */ 1561 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); 1562 ena->ena_num_txqs = ena->ena_num_intrs - 1; 1563 ASSERT3U(ena->ena_num_txqs, >, 0); 1564 ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs), 1565 KM_SLEEP); 1566 1567 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 1568 ena_txq_t *txq = &ena->ena_txqs[i]; 1569 1570 txq->et_txqs_idx = i; 1571 /* The 0th vector is for Admin + AENQ. */ 1572 txq->et_intr_vector = i + 1; 1573 txq->et_mrh = NULL; 1574 1575 mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER, 1576 DDI_INTR_PRI(ena->ena_intr_pri)); 1577 mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER, 1578 DDI_INTR_PRI(ena->ena_intr_pri)); 1579 1580 txq->et_ena = ena; 1581 txq->et_sq_num_descs = ena->ena_txq_num_descs; 1582 txq->et_cq_num_descs = ena->ena_txq_num_descs; 1583 1584 if (!ena_stat_txq_init(txq)) { 1585 return (B_FALSE); 1586 } 1587 1588 if (!ena_alloc_txq(txq)) { 1589 return (B_FALSE); 1590 } 1591 } 1592 1593 return (B_TRUE); 1594 } 1595 1596 static void 1597 ena_cleanup_txqs(ena_t *ena) 1598 { 1599 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 1600 ena_txq_t *txq = &ena->ena_txqs[i]; 1601 1602 ena_cleanup_txq(txq); 1603 mutex_destroy(&txq->et_lock); 1604 mutex_destroy(&txq->et_stat_lock); 1605 ena_stat_txq_cleanup(txq); 1606 } 1607 1608 kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs)); 1609 } 1610 1611 ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = { 1612 { 1613 .ead_seq = ENA_ATTACH_PCI, 1614 .ead_name = "PCI config", 1615 .ead_attach_fn = ena_attach_pci, 1616 .ead_attach_hard_fail = B_TRUE, 1617 .ead_cleanup_fn = ena_cleanup_pci, 1618 }, 1619 1620 { 1621 .ead_seq = ENA_ATTACH_REGS, 1622 .ead_name = "BAR mapping", 1623 .ead_attach_fn = ena_attach_regs_map, 1624 .ead_attach_hard_fail = B_TRUE, 1625 .ead_cleanup_fn = ena_cleanup_regs_map, 1626 }, 1627 1628 { 1629 .ead_seq = ENA_ATTACH_DEV_INIT, 1630 .ead_name = "device initialization", 1631 .ead_attach_fn = ena_attach_device_init, 1632 .ead_attach_hard_fail = B_TRUE, 1633 .ead_cleanup_fn = ena_cleanup_device_init, 1634 }, 1635 1636 { 1637 .ead_seq = ENA_ATTACH_READ_CONF, 1638 .ead_name = "ena.conf", 1639 .ead_attach_fn = ena_attach_read_conf, 1640 .ead_attach_hard_fail =
B_TRUE, 1641 .ead_cleanup_fn = ena_no_cleanup, 1642 }, 1643 1644 { 1645 .ead_seq = ENA_ATTACH_DEV_CFG, 1646 .ead_name = "device config", 1647 .ead_attach_fn = ena_attach_dev_cfg, 1648 .ead_attach_hard_fail = B_TRUE, 1649 .ead_cleanup_fn = ena_no_cleanup, 1650 }, 1651 1652 { 1653 .ead_seq = ENA_ATTACH_INTR_ALLOC, 1654 .ead_name = "interrupt allocation", 1655 .ead_attach_fn = ena_attach_intr_alloc, 1656 .ead_attach_hard_fail = B_TRUE, 1657 .ead_cleanup_fn = ena_cleanup_intr_alloc, 1658 }, 1659 1660 { 1661 .ead_seq = ENA_ATTACH_INTR_HDLRS, 1662 .ead_name = "interrupt handlers", 1663 .ead_attach_fn = ena_intr_add_handlers, 1664 .ead_attach_hard_fail = B_TRUE, 1665 .ead_cleanup_fn = ena_intr_remove_handlers, 1666 }, 1667 1668 { 1669 .ead_seq = ENA_ATTACH_TXQS_ALLOC, 1670 .ead_name = "Tx queues", 1671 .ead_attach_fn = ena_attach_alloc_txqs, 1672 .ead_attach_hard_fail = B_TRUE, 1673 .ead_cleanup_fn = ena_cleanup_txqs, 1674 }, 1675 1676 { 1677 .ead_seq = ENA_ATTACH_RXQS_ALLOC, 1678 .ead_name = "Rx queues", 1679 .ead_attach_fn = ena_attach_alloc_rxqs, 1680 .ead_attach_hard_fail = B_TRUE, 1681 .ead_cleanup_fn = ena_cleanup_rxqs, 1682 }, 1683 1684 /* 1685 * The chance of mac_unregister() failure poses a problem to 1686 * cleanup. We address interrupt disablement and mac 1687 * unregistration explicitly in the attach/detach routines. 1688 */ 1689 { 1690 .ead_seq = ENA_ATTACH_MAC_REGISTER, 1691 .ead_name = "mac registration", 1692 .ead_attach_fn = ena_mac_register, 1693 .ead_attach_hard_fail = B_TRUE, 1694 .ead_cleanup_fn = ena_no_cleanup, 1695 }, 1696 1697 { 1698 .ead_seq = ENA_ATTACH_INTRS_ENABLE, 1699 .ead_name = "enable interrupts", 1700 .ead_attach_fn = ena_intrs_enable, 1701 .ead_attach_hard_fail = B_TRUE, 1702 .ead_cleanup_fn = ena_no_cleanup, 1703 } 1704 }; 1705 1706 /* 1707 * This function undoes any work done by ena_attach(), either in 1708 * response to a failed attach or a planned detach. At the end of this 1709 * function ena_attach_seq should be zero, otherwise it means 1710 * something has not been freed/uninitialized. 1711 */ 1712 static void 1713 ena_cleanup(ena_t *ena) 1714 { 1715 if (ena == NULL || ena->ena_attach_seq == 0) { 1716 return; 1717 } 1718 1719 /* 1720 * We VERIFY this because if the seq is greater than entries 1721 * we drift into space and execute god knows what.
static int
ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ena_t *ena;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
	ena->ena_instance = ddi_get_instance(dip);
	ena->ena_dip = dip;
	ena->ena_page_sz = ddi_ptob(dip, 1);

	for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
		boolean_t success;
		ena_attach_desc_t *desc = &ena_attach_tbl[i];

		ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
		    i);

		if (!(success = desc->ead_attach_fn(ena))) {
			ena_err(ena, "attach sequence failed: %s (%d)",
			    desc->ead_name, i);

			if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
				/*
				 * In this specific case
				 * ENA_ATTACH_INTRS_ENABLE has failed,
				 * and we may or may not be able to
				 * unregister the mac, depending on
				 * whether something in userspace has
				 * created a client on top.
				 *
				 * NOTE: Something that would be nice
				 * to add to mac is the ability to
				 * register a provider separate from
				 * "publishing" it to the rest of the
				 * system. This would allow a driver
				 * to register its mac, do some
				 * additional work that might fail,
				 * and then unregister if that work
				 * fails without concern for any
				 * chance of failure when calling
				 * unregister. This would remove the
				 * complexity of the situation we are
				 * trying to address here, as we would
				 * know that until the mac has been
				 * "published", there is no chance for
				 * mac_unregister() to fail.
				 */
				if (ena_mac_unregister(ena) != 0) {
					return (DDI_FAILURE);
				}

				ena->ena_attach_seq--;
			} else {
				/*
				 * Since the ead_seq is predicated on
				 * successful ead_attach_fn we must
				 * run the specific cleanup handler
				 * before calling the global cleanup
				 * routine. This also means that all
				 * cleanup functions must be able to
				 * deal with partial success of the
				 * corresponding ead_attach_fn (see
				 * the sketch after this function).
				 */
				desc->ead_cleanup_fn(ena);
			}

			ena_cleanup(ena);
			kmem_free(ena, sizeof (ena_t));
			return (DDI_FAILURE);
		}

		if (success) {
			ena_dbg(ena, "attach sequence completed: %s (%d)",
			    desc->ead_name, i);
		}

		ena->ena_attach_seq = desc->ead_seq;
	}

	/*
	 * Now that interrupts are enabled make sure to tell the
	 * device that all AENQ descriptors are ready for writing, and
	 * unmask the admin interrupt.
	 *
	 * Note that this interrupt is generated for both the admin
	 * queue and the AENQ, but this driver always polls the admin
	 * queue. The surplus interrupt for admin command completion
	 * triggers a harmless check of the AENQ.
	 */
	ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, ENAHW_INTR_UNMASK);
	ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
	    ena->ena_aenq.eaenq_num_descs);

	ddi_set_driver_private(dip, ena);
	return (DDI_SUCCESS);
}

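/*
 * Illustrative sketch only -- not part of the driver. As noted in
 * the failure path above, every ead_cleanup_fn must tolerate a
 * partially successful ead_attach_fn. The usual shape is to check
 * whether each resource was actually set up before tearing it down,
 * sketched here with a hypothetical resource and helper (the real
 * cleanup functions in this driver follow the same pattern):
 *
 *	static void
 *	example_cleanup_foo(ena_t *ena)
 *	{
 *		// The attach step may have failed before allocating
 *		// anything, so only undo what actually exists.
 *		if (ena->ena_foo == NULL) {
 *			return;
 *		}
 *
 *		example_foo_free(ena->ena_foo);
 *		ena->ena_foo = NULL;
 *	}
 */
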
static int
ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	ena_t *ena = ddi_get_driver_private(dip);

	if (ena == NULL) {
		return (DDI_FAILURE);
	}

	/*
	 * Before we can proceed to cleanup we have to treat
	 * mac_unregister() explicitly -- if there are still
	 * outstanding clients, then we can't proceed with detach or
	 * cleanup.
	 */

	/*
	 * Why this would fail I don't know, but if we proceed to mac
	 * unregister, then there is a good chance we will panic in
	 * the Rx interrupt handler when calling mac_rx_ring().
	 */
	if (!ena_intrs_disable(ena)) {
		return (DDI_FAILURE);
	}

	/* We can't detach if clients are actively using the device. */
	if (ena_mac_unregister(ena) != 0) {
		(void) ena_intrs_enable(ena);
		return (DDI_FAILURE);
	}

	/*
	 * At this point we can proceed with the rest of cleanup on a
	 * best-effort basis.
	 */
	ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
	ena_cleanup(ena);
	ddi_set_driver_private(dip, NULL);
	kmem_free(ena, sizeof (ena_t));
	return (DDI_SUCCESS);
}

static struct cb_ops ena_cb_ops = {
	.cb_open = nodev,
	.cb_close = nodev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops ena_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = ena_attach,
	.devo_detach = ena_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &ena_cb_ops
};

static struct modldrv ena_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "AWS ENA Ethernet",
	.drv_dev_ops = &ena_dev_ops
};

static struct modlinkage ena_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &ena_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);

	if ((ret = mod_install(&ena_modlinkage)) != 0) {
		mac_fini_ops(&ena_dev_ops);
		return (ret);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&ena_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&ena_modlinkage)) != 0) {
		return (ret);
	}

	mac_fini_ops(&ena_dev_ops);
	return (ret);
}