1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2021 Oxide Computer Company 14 */ 15 16 #include "ena_hw.h" 17 #include "ena.h" 18 19 /* 20 * Elastic Network Adapter (ENA) Driver 21 * ------------------------------------ 22 * 23 * The ena driver provides support for the AWS ENA device, also 24 * referred to as their "enhanced networking". This device is present 25 * on "Nitro"-based instances. It presents itself with the following 26 * PCI Vendor/Device IDs: 27 * 28 * o 1d0f:0ec2 -- ENA PF 29 * o 1d0f:1ec2 -- ENA PF (Reserved) 30 * o 1d0f:ec20 -- ENA VF 31 * o 1d0f:ec21 -- ENA VF (Reserved) 32 * 33 * This driver provides support for only the essential features needed 34 * to drive traffic on an ENA device. Support for the following 35 * features IS NOT currently implemented. 36 * 37 * o Admin Queue Interrupts: queue completion events are always polled 38 * o AENQ keep alive 39 * o FMA 40 * o Rx checksum offloads 41 * o Tx checksum offloads 42 * o Tx DMA bind (borrow buffers) 43 * o Rx DMA bind (loaned buffers) 44 * o TSO 45 * o RSS 46 * o Low Latency Queues (LLQ) 47 * o Support for different Tx completion policies 48 * o More controlled Tx recycling and Rx refill 49 * 50 * Even without these features the ena driver should perform 51 * reasonably well. 52 * 53 * Driver vs. Hardware Types 54 * ------------------------- 55 * 56 * To properly communicate with the ENA device the driver must 57 * populate memory (registers and buffers) with specific types. These 58 * types are defined by the device and are found under the "common" 59 * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have 60 * simplified this a bit by defining all device-specific types in the 61 * ena_hw.h file. Furthermore, all device-specific types are given an 62 * "enahw" prefix. This makes it clear when we are dealing with a 63 * device type and when we are dealing with a driver type. 64 * 65 * [1]: https://github.com/amzn/amzn-drivers 66 * 67 * Groups, Rings (Queues), and Interrupts 68 * -------------------------------------- 69 * 70 * The ENA device presents one mac group. This single mac group 71 * corresponds to the single unicast address that this device represents 72 * in your AWS instance. The ENA device presents no option for 73 * configuring additional MAC addresses, multicast, or promisc mode -- 74 * you receive only what AWS wants you to receive. 75 * 76 * This single mac group may have one or more rings. The ENA driver 77 * refers to rings as queues, for no special reason other than it was 78 * the dominant language in the Linux and FreeBSD drivers, and it 79 * spilled over into this port. The upper bound on the number of queues 80 * is presented by the device. However, we don't just go with whatever 81 * number of queues the device reports, but rather we limit the queues 82 * based on other factors such as an absolute maximum, the number of 83 * online CPUs, and the number of available interrupts. The upper bound 84 * is calculated by ena_set_max_io_queues(), and that is used and 85 * possibly further restricted in ena_attach_intr_alloc(). 
At this 86 * point, ultimately, it is the number of available interrupts (minus 87 * one for the admin queue) that determines the number of queues: one 88 * Tx and one Rx on each I/O interrupt. 89 * 90 * NOTE: Perhaps it is overly restrictive to limit the number of 91 * queues to the number of I/O interrupts. Something worth considering 92 * on larger instances if they present far fewer interrupts than they 93 * do queues + CPUs. 94 * 95 * The ENA device presents MSI-X interrupts only. During attach the 96 * driver queries the number of available interrupts and sets aside 97 * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N). 98 * This means that a Tx/Rx queue at index 0 will map to vector 1, and 99 * so on. 100 * 101 * NOTE: The ENA driver currently doesn't make use of the Admin Queue 102 * interrupt. This interrupt is used to notify the driver that a 103 * command response is ready. The ENA driver always polls the Admin 104 * Queue for responses. 105 * 106 * Tx Queue Workings 107 * ----------------- 108 * 109 * A single Tx queue (ena_txq_t) is made up of one submission queue 110 * (SQ) and its paired completion queue (CQ). These two queues form a 111 * logical descriptor ring which is used to send packets out of the 112 * device -- where each SQ entry describes the packet to be sent 113 * (enahw_tx_desc_t) and each CQ entry describes the result of sending 114 * a packet (enahw_tx_cdesc_t). For this to work the host and device 115 * must agree on which descriptors are currently owned by the host 116 * (free for sending) and which are owned by the device (pending 117 * device completion). This state is tracked on the host side via head 118 * and tail indexes along with a phase value. 119 * 120 * The head and tail values represent the head and tail of the FIFO 121 * queue of pending packets -- the next packet to be sent by the 122 * device is at head, and all descriptors up to tail are ready for 123 * sending. The phase allows the host to determine which CQ 124 * descriptors represent completed events when using per-SQ completion 125 * events (as opposed to queue head pointer updates). As the queues 126 * represent a logical ring buffer, the phase must alternate on 127 * wrap-around. The device initializes the phase to zero, and the host 128 * starts with a phase of 1. The first packet descriptor writes, and 129 * their corresponding completions, are indicated with a phase of 1. 130 * 131 * 132 * For example, the diagram below represents the SQ/CQ state after the 133 * first 6 packets have been sent by the host and 2 of them have been 134 * completed by the device (and these completions have been processed 135 * by the driver). In this state the host could send 4 more packets 136 * before needing to wait on completion events. 137 * 138 * 139 * +---+---+---+---+---+---+---+---+ 140 * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1 141 * +---+---+---+---+---+---+---+---+ 142 * ^ 143 * | 144 * tail 145 * head 146 * | 147 * v 148 * +---+---+---+---+---+---+---+---+ 149 * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1 150 * +---+---+---+---+---+---+---+---+ 151 * 152 * 153 * The next diagram shows how the state changes as 5 more packets are 154 * sent (for a total of 11) and 7 more are completed (for a total of 155 * 9). Notice that as the SQ and CQ have wrapped around, their phases 156 * have been complemented. In this state the host could send 6 more 157 * packets before needing to wait on completion events. 
158 * 159 * +---+---+---+---+---+---+---+---+ 160 * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0 161 * +---+---+---+---+---+---+---+---+ 162 * ^ 163 * | 164 * tail 165 * head 166 * | 167 * v 168 * +---+---+---+---+---+---+---+---+ 169 * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0 170 * +---+---+---+---+---+---+---+---+ 171 * 172 * 173 * Currently, all packets are copied for Tx. At ring start we allocate 174 * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a 175 * DMA buffer associated with it; and each buffer is large enough to 176 * hold the MTU. Therefore, Tx descriptors and TCBs currently have a 177 * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to 178 * the TCB's DMA buffer, and a new descriptor is written to the SQ 179 * describing said TCB buffer. If and when we add more advanced 180 * features like DMA binding of mblks and TSO, this 1:1 guarantee will 181 * no longer hold. 182 * 183 * Rx Queue Workings 184 * ----------------- 185 * 186 * In terms of implementing the logical descriptor ring, the Rx queues 187 * are very much like the Tx queues. There is a paired SQ and CQ for 188 * each logical ring. The difference is that in Rx the SQ is for 189 * handing buffers to the device to fill, and the CQ is for describing 190 * the contents of those buffers for a given received frame. At Rx 191 * ring start we allocate an Rx Control Buffer (RCB) for each 192 * descriptor in the ring. Each RCB has a DMA buffer associated with 193 * it; and each buffer is large enough to hold the MTU. For each 194 * received frame we copy the contents out of the RCB and into its own 195 * mblk, immediately returning the RCB for reuse. As with Tx, this 196 * gives us a simple 1:1 mapping currently, but if more advanced 197 * features are implemented later this could change. 198 * 199 * Asynchronous Event Notification Queue (AENQ) 200 * -------------------------------------------- 201 * 202 * Each ENA device comes with a mechanism for sending out-of-band 203 * notifications to the driver. This includes events like link state 204 * changes, fatal errors, and a watchdog/keep alive signal. The AENQ 205 * delivery mechanism is via interrupt, handled by the ena_aenq_work() 206 * function, which dispatches via the eaenq_hdlrs table. If no handler 207 * is registered, the ena_aenq_default_hdlr() handler is used. A given 208 * device may not support all the different event types 209 * (enahw_aenq_groups_t); and the driver may choose to enable a subset 210 * of the supported events. During attach we call ena_setup_aenq() to 211 * negotiate the supported/enabled events. The enabled groups are stored 212 * in ena_aenq_enabled_groups. 213 * 214 * Queues and Unsigned Wraparound 215 * ------------------------------ 216 * 217 * All the queues use a uint16_t value as their head/tail values, e.g. 218 * the Rx queue's er_cq_head_idx value. You might notice that we only 219 * ever increment these values, letting them perform implicit unsigned 220 * integer wraparound. This is intended. This is the same behavior as 221 * the common code, and seems to be what the hardware expects. Of 222 * course, when accessing our own descriptor arrays we must make sure 223 * to first perform a modulo of this value or risk running off into 224 * space. 
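 * * As a minimal sketch of the pattern (the names here are illustrative, not the driver's actual fields): * * uint16_t head = 65535; (free-running, about to wrap) * uint16_t slot = head % num_descs; (index into descs[]) * process(&descs[slot]); * head++; (wraps to 0, as intended) * * The head keeps counting straight through the wraparound, while the modulo keeps every descriptor array access in bounds. The AENQ handling below relies on the same idea with the mask form, head & (num_descs - 1), which is equivalent when the ring depth is a power of two. 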
225 * 226 * Attach Sequencing 227 * ----------------- 228 * 229 * Most drivers implement their attach/detach/cleanup functions as a 230 * sequential stream of function calls used to allocate and initialize 231 * resources in an order determined by the device's programming manual 232 * combined with any requirements imposed by the kernel and its 233 * relevant modules. These functions can become quite long. It is 234 * often hard to see the order in which steps are taken, and even 235 * harder to tell if detach/cleanup undoes them in the correct order, 236 * or even if it undoes them at all! The only sure way to understand 237 * the flow is to take good notes while closely inspecting each line 238 * of code. Even then, it's easy for attach and detach to get out of 239 * sync. 240 * 241 * Some more recent drivers have improved on this situation by using a 242 * bit vector to track the sequence of events in attach/detach. Each 243 * bit is declared as an enum value, in the same order attach is 244 * expected to run, and thus detach runs in the exact 245 * opposite order. This has three main benefits: 246 * 247 * 1. It makes it easier to determine sequence order at a 248 * glance. 249 * 250 * 2. It gives a better idea of what state the device is in during 251 * debugging (the sequence bit vector is kept with the instance 252 * state). 253 * 254 * 3. The detach function can verify that all sequence bits are 255 * cleared, indicating that everything done in attach was 256 * successfully undone. 257 * 258 * These are great improvements. However, the attach/detach functions 259 * can still become unruly, and there is still no guarantee that 260 * detach is done in the opposite order of attach (this is not always 261 * strictly required, but is probably the best way to write detach). 262 * There is still a lot of boilerplate and chance for programmer 263 * error. 264 * 265 * The ena driver takes the sequence idea a bit further, creating a 266 * descriptor table of the attach sequence (ena_attach_tbl). This 267 * table is used by attach/detach to generically, declaratively, and 268 * programmatically enforce the precise sequence order and verify that 269 * anything that is done is undone. This provides several benefits: 270 * 271 * o Correct order is enforced implicitly by the descriptor table. 272 * It is impossible for the detach sequence to run in any order 273 * other than the opposite of attach. 274 * 275 * o It is obvious what the precise attach sequence is. While the 276 * bit vector enum helps a lot with this, it doesn't prevent 277 * programmer error. With the sequence defined as a declarative 278 * table, it is easy for the programmer to see the order and 279 * know it's followed exactly. 280 * 281 * o It is impossible to modify the attach sequence without also 282 * specifying a callback for its dual in the detach sequence. 283 * 284 * o Common and repetitive code like error checking, logging, and bit 285 * vector modification is eliminated and centralized, again 286 * reducing the chance of programmer error. 287 * 288 * The ena attach sequence is defined under ena_attach_seq_t. The 289 * descriptor table is defined under ena_attach_tbl. 290 */ 291 292 /* 293 * These are some basic data layout invariants on which development 294 * assumptions were made. 295 */ 296 CTASSERT(sizeof (enahw_aenq_desc_t) == 64); 297 /* TODO: Why doesn't this work? 
*/ 298 /* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */ 299 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t)); 300 CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t)); 301 CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t)); 302 /* 303 * We add this here as an extra safety check to make sure that any 304 * addition to the AENQ group enum also updates the groups array num 305 * value. 306 */ 307 CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6); 308 309 /* 310 * Amazon does not specify the endianness of the ENA device. We assume 311 * it's the same as the bus, and we assume the CPU/bus is always 312 * little endian. 313 */ 314 #ifdef _BIG_ENDIAN 315 #error "ENA driver is little-endian only" 316 #endif 317 318 /* 319 * These values are used to communicate the driver version to the AWS 320 * hypervisor via the ena_set_host_info() function. We don't know 321 * exactly what AWS does with this info, but it's fairly safe to assume 322 * it's used solely for debug/informational purposes. The Linux driver 323 * updates these values frequently as bugs are fixed and features are 324 * added. 325 */ 326 #define ENA_DRV_VER_MAJOR 1 327 #define ENA_DRV_VER_MINOR 0 328 #define ENA_DRV_VER_SUBMINOR 0 329 330 uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; 331 332 /* 333 * Log an error message. We leave the destination (console or system 334 * log) up to the caller. 335 */ 336 void 337 ena_err(const ena_t *ena, const char *fmt, ...) 338 { 339 va_list ap; 340 341 va_start(ap, fmt); 342 if (ena != NULL && ena->ena_dip != NULL) { 343 vdev_err(ena->ena_dip, CE_WARN, fmt, ap); 344 } else { 345 vcmn_err(CE_WARN, fmt, ap); 346 } 347 va_end(ap); 348 } 349 350 /* 351 * Set this to B_TRUE to enable debug messages. 352 */ 353 boolean_t ena_debug = B_FALSE; 354 355 /* 356 * Log a debug message. We force all debug messages to go to the 357 * system log. 358 */ 359 void 360 ena_dbg(const ena_t *ena, const char *fmt, ...) 
361 { 362 va_list ap; 363 364 if (ena_debug) { 365 char msg[1024]; 366 367 va_start(ap, fmt); 368 (void) vsnprintf(msg, sizeof (msg), fmt, ap); 369 va_end(ap); 370 371 if (ena != NULL && ena->ena_dip != NULL) { 372 dev_err(ena->ena_dip, CE_NOTE, "!%s", msg); 373 } else { 374 cmn_err(CE_NOTE, "!%s", msg); 375 } 376 } 377 } 378 379 ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = { 380 { .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" }, 381 { .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" }, 382 { .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" }, 383 { 384 .eag_type = ENAHW_AENQ_GROUP_NOTIFICATION, 385 .eag_str = "NOTIFICATION" 386 }, 387 { .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" }, 388 { 389 .eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES, 390 .eag_str = "REFRESH CAPABILITIES" 391 }, 392 }; 393 394 void 395 ena_aenq_work(ena_t *ena) 396 { 397 ena_aenq_t *aenq = &ena->ena_aenq; 398 uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); 399 boolean_t processed = B_FALSE; 400 enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod]; 401 uint64_t ts; 402 403 ts = ((uint64_t)desc->ead_ts_high << 32) | (uint64_t)desc->ead_ts_low; 404 ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL); 405 406 while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) { 407 ena_aenq_hdlr_t hdlr; 408 409 ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM); 410 processed = B_TRUE; 411 ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64 412 " us", desc->ead_group, 413 ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome, 414 ts); 415 416 hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group]; 417 hdlr(ena, desc); 418 419 aenq->eaenq_head++; 420 head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); 421 422 if (head_mod == 0) { 423 aenq->eaenq_phase = !aenq->eaenq_phase; 424 } 425 426 desc = &aenq->eaenq_descs[head_mod]; 427 } 428 429 if (processed) { 430 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB, 431 aenq->eaenq_head); 432 } 433 } 434 435 /* 436 * Use for attach sequences which perform no resource allocation (or 437 * global state modification) and thus require no subsequent 438 * deallocation. 
439 */ 440 static void 441 ena_no_cleanup(ena_t *ena) 442 { 443 } 444 445 static boolean_t 446 ena_attach_pci(ena_t *ena) 447 { 448 ddi_acc_handle_t hdl; 449 450 if (pci_config_setup(ena->ena_dip, &hdl) != 0) { 451 return (B_FALSE); 452 } 453 454 ena->ena_pci_hdl = hdl; 455 ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID); 456 ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID); 457 ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID); 458 ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID); 459 ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID); 460 ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x", 461 ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev, 462 ena->ena_pci_svid, ena->ena_pci_sdid); 463 464 return (B_TRUE); 465 } 466 467 static void 468 ena_cleanup_pci(ena_t *ena) 469 { 470 pci_config_teardown(&ena->ena_pci_hdl); 471 } 472 473 static void 474 ena_cleanup_regs_map(ena_t *ena) 475 { 476 ddi_regs_map_free(&ena->ena_reg_hdl); 477 } 478 479 static boolean_t 480 ena_attach_regs_map(ena_t *ena) 481 { 482 int ret = 0; 483 484 if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) != 485 DDI_SUCCESS) { 486 ena_err(ena, "failed to get register set %d size", 487 ENA_REG_NUMBER); 488 return (B_FALSE); 489 } 490 491 ena_dbg(ena, "register size: %ld", ena->ena_reg_size); 492 bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr)); 493 ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1; 494 ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC; 495 ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 496 497 /* 498 * This function can return several different failure values, 499 * so we make sure to capture its return value for the purpose 500 * of logging. 501 */ 502 ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER, 503 &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr, 504 &ena->ena_reg_hdl); 505 506 if (ret != DDI_SUCCESS) { 507 ena_err(ena, "failed to map register set %d: %d", 508 ENA_REG_NUMBER, ret); 509 return (B_FALSE); 510 } 511 512 ena_dbg(ena, "registers mapped to base: 0x%p", 513 (void *)ena->ena_reg_base); 514 515 return (B_TRUE); 516 } 517 518 /* 519 * Free any resources related to the admin submission queue. 520 */ 521 static void 522 ena_admin_sq_free(ena_t *ena) 523 { 524 ena_dma_free(&ena->ena_aq.ea_sq.eas_dma); 525 } 526 527 /* 528 * Initialize the admin submission queue. 
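 * This allocates the SQ descriptor ring and programs its base address, depth, and entry size into the device's ASQ registers. 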
529 */ 530 static boolean_t 531 ena_admin_sq_init(ena_t *ena) 532 { 533 ena_adminq_t *aq = &ena->ena_aq; 534 ena_dma_buf_t *dma = &aq->ea_sq.eas_dma; 535 size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries); 536 uint32_t addr_low, addr_high, wval; 537 ena_dma_conf_t conf = { 538 .edc_size = size, 539 .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT, 540 .edc_sgl = 1, 541 .edc_endian = DDI_NEVERSWAP_ACC, 542 .edc_stream = B_FALSE, 543 }; 544 545 if (!ena_dma_alloc(ena, dma, &conf, size)) { 546 ena_err(ena, "failed to allocate DMA for Admin SQ"); 547 return (B_FALSE); 548 } 549 550 aq->ea_sq.eas_entries = (void *)dma->edb_va; 551 aq->ea_sq.eas_tail = 0; 552 aq->ea_sq.eas_phase = 1; 553 aq->ea_sq.eas_dbaddr = 554 (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB); 555 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); 556 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); 557 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); 558 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low); 559 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high); 560 wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) | 561 ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries)); 562 ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval); 563 return (B_TRUE); 564 } 565 566 /* 567 * Free any resources related to the admin completion queue. 568 */ 569 static void 570 ena_admin_cq_free(ena_t *ena) 571 { 572 ena_dma_free(&ena->ena_aq.ea_cq.eac_dma); 573 } 574 575 /* 576 * Initialize the admin completion queue. 577 */ 578 static boolean_t 579 ena_admin_cq_init(ena_t *ena) 580 { 581 ena_adminq_t *aq = &ena->ena_aq; 582 ena_dma_buf_t *dma = &aq->ea_cq.eac_dma; 583 size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries); 584 uint32_t addr_low, addr_high, wval; 585 ena_dma_conf_t conf = { 586 .edc_size = size, 587 .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT, 588 .edc_sgl = 1, 589 .edc_endian = DDI_NEVERSWAP_ACC, 590 .edc_stream = B_FALSE, 591 }; 592 593 if (!ena_dma_alloc(ena, dma, &conf, size)) { 594 ena_err(ena, "failed to allocate DMA for Admin CQ"); 595 return (B_FALSE); 596 } 597 598 aq->ea_cq.eac_entries = (void *)dma->edb_va; 599 aq->ea_cq.eac_head = 0; 600 aq->ea_cq.eac_phase = 1; 601 ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); 602 addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); 603 addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); 604 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low); 605 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high); 606 wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) | 607 ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries)); 608 ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval); 609 return (B_TRUE); 610 } 611 612 static void 613 ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc) 614 { 615 ena_t *ena = data; 616 617 ena->ena_aenq_stat.eaes_default.value.ui64++; 618 ena_dbg(ena, "unimplemented handler for aenq group: %s", 619 ena_groups_str[desc->ead_group].eag_str); 620 } 621 622 static void 623 ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc) 624 { 625 ena_t *ena = data; 626 boolean_t is_up = (desc->ead_payload.link_change.flags & 627 ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0; 628 629 /* 630 * The interupts are not enabled until after we register mac, 631 * so the mac handle should be valid. 
632 */ 633 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER); 634 ena->ena_aenq_stat.eaes_link_change.value.ui64++; 635 636 mutex_enter(&ena->ena_lock); 637 638 /* 639 * Notify mac only on an actual change in status. 640 */ 641 if (ena->ena_link_up != is_up) { 642 if (is_up) { 643 mac_link_update(ena->ena_mh, LINK_STATE_UP); 644 } else { 645 mac_link_update(ena->ena_mh, LINK_STATE_DOWN); 646 } 647 } 648 649 ena->ena_link_up = is_up; 650 651 mutex_exit(&ena->ena_lock); 652 } 653 654 /* 655 * Free any resources related to the Async Event Notification Queue. 656 */ 657 static void 658 ena_aenq_free(ena_t *ena) 659 { 660 ena_dma_free(&ena->ena_aenq.eaenq_dma); 661 } 662 663 static void 664 ena_aenq_set_def_hdlrs(ena_aenq_t *aenq) 665 { 666 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr; 667 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr; 668 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr; 669 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] = 670 ena_aenq_default_hdlr; 671 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr; 672 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] = 673 ena_aenq_default_hdlr; 674 } 675 /* 676 * Initialize the Async Event Notification Queue. 677 */ 678 static boolean_t 679 ena_aenq_init(ena_t *ena) 680 { 681 ena_aenq_t *aenq = &ena->ena_aenq; 682 size_t size; 683 uint32_t addr_low, addr_high, wval; 684 ena_dma_conf_t conf; 685 686 aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS; 687 size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs); 688 689 conf = (ena_dma_conf_t) { 690 .edc_size = size, 691 .edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT, 692 .edc_sgl = 1, 693 .edc_endian = DDI_NEVERSWAP_ACC, 694 .edc_stream = B_FALSE, 695 }; 696 697 if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) { 698 ena_err(ena, "failed to allocate DMA for AENQ"); 699 return (B_FALSE); 700 } 701 702 aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va; 703 aenq->eaenq_head = 0; 704 aenq->eaenq_phase = 1; 705 bzero(aenq->eaenq_descs, size); 706 ena_aenq_set_def_hdlrs(aenq); 707 708 aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = 709 ena_aenq_link_change_hdlr; 710 711 ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress); 712 addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress); 713 addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32); 714 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low); 715 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high); 716 ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV); 717 wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) | 718 ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs)); 719 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval); 720 return (B_TRUE); 721 } 722 723 /* 724 * We limit the max number of I/O queues based on several aspects of 725 * the underlying hardware. 726 * 727 * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES, 728 * which comes from the common code and presumably is based on device 729 * constraints. 730 * 731 * 2. Next we latch the number of I/O queues to the number of online 732 * CPUs. The idea being that each queue is a parallel work stream, 733 * and having more queues than CPUs to flush them will not improve 734 * performance. The number of online CPUs can change dynamically, 735 * and that's okay, everything should still work fine, it just 736 * might not be ideal. 737 * 738 * 3. 
Next we latch the number of I/O queues to the smallest of the 739 * max Tx queues and max Rx queues. We could probably loosen this 740 * restriction in the future, and have separate max I/O queues for 741 * Tx and Rx. This is what Linux does, and seems like a fine place 742 * to start. 743 */ 744 static void 745 ena_set_max_io_queues(ena_t *ena) 746 { 747 uint32_t max = ENAHW_MAX_NUM_IO_QUEUES; 748 749 max = MIN(ncpus_online, max); 750 /* 751 * Supposedly a device could present a different number of SQs 752 * and CQs. This driver is designed in a way that requires 753 * each SQ to have a corresponding and dedicated CQ (how would 754 * it work otherwise?). Therefore, we must check both values 755 * and find the minimum between them. 756 */ 757 max = MIN(ena->ena_tx_max_sq_num, max); 758 max = MIN(ena->ena_tx_max_cq_num, max); 759 max = MIN(ena->ena_rx_max_sq_num, max); 760 max = MIN(ena->ena_rx_max_cq_num, max); 761 762 763 /* This shouldn't happen, but just in case. */ 764 if (max == 0) { 765 max = 1; 766 } 767 768 ena->ena_max_io_queues = max; 769 } 770 771 /* 772 * We require that an Rx or Tx buffer be able to hold the maximum MTU 773 * along with the maximum frame header length. In this case we know 774 * ENA is presenting us with an Ethernet frame, so we add the size of an 775 * Ethernet VLAN header. Rx additionally requires some margin for the 776 * sake of IP header alignment. 777 */ 778 static void 779 ena_update_buf_sizes(ena_t *ena) 780 { 781 ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header); 782 ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu; 783 ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total, 784 ena->ena_page_sz, uint32_t); 785 ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total + 786 ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t); 787 } 788 789 static boolean_t 790 ena_get_offloads(ena_t *ena) 791 { 792 int ret = 0; 793 enahw_resp_desc_t resp; 794 enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload; 795 796 ena->ena_tx_l3_ipv4_csum = B_FALSE; 797 798 ena->ena_tx_l4_ipv4_part_csum = B_FALSE; 799 ena->ena_tx_l4_ipv4_full_csum = B_FALSE; 800 ena->ena_tx_l4_ipv4_lso = B_FALSE; 801 802 ena->ena_tx_l4_ipv6_part_csum = B_FALSE; 803 ena->ena_tx_l4_ipv6_full_csum = B_FALSE; 804 ena->ena_tx_l4_ipv6_lso = B_FALSE; 805 806 ena->ena_rx_l3_ipv4_csum = B_FALSE; 807 ena->ena_rx_l4_ipv4_csum = B_FALSE; 808 ena->ena_rx_l4_ipv6_csum = B_FALSE; 809 ena->ena_rx_hash = B_FALSE; 810 811 bzero(&resp, sizeof (resp)); 812 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG, 813 ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER); 814 815 if (ret == ENOTSUP) { 816 /* 817 * In this case the device does not support querying 818 * for hardware offloads. We take that as a sign that 819 * the device provides no offloads. 
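 * Returning B_TRUE here leaves all of the offload flags at the B_FALSE defaults set above. 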
820 */ 821 return (B_TRUE); 822 } else if (ret != 0) { 823 ena_err(ena, "error getting stateless offload: %d", ret); 824 return (B_FALSE); 825 } 826 827 ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat); 828 829 ena->ena_tx_l4_ipv4_part_csum = 830 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat); 831 ena->ena_tx_l4_ipv4_full_csum = 832 ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat); 833 ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat); 834 835 ena->ena_tx_l4_ipv6_part_csum = 836 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat); 837 ena->ena_tx_l4_ipv6_full_csum = 838 ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat); 839 ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat); 840 841 ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat); 842 ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat); 843 ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat); 844 return (B_TRUE); 845 } 846 847 static int 848 ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval, 849 const int defval) 850 { 851 int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip, 852 DDI_PROP_DONTPASS, propname, defval); 853 854 if (value > maxval) { 855 ena_err(ena, "user value %s=%d exceeded maximum, setting to %d", 856 propname, value, maxval); 857 value = maxval; 858 } 859 860 if (value < minval) { 861 ena_err(ena, "user value %s=%d below minimum, setting to %d", 862 propname, value, minval); 863 value = minval; 864 } 865 866 return (value); 867 } 868 869 static boolean_t 870 ena_set_mtu(ena_t *ena) 871 { 872 int ret = 0; 873 enahw_cmd_desc_t cmd; 874 enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu; 875 enahw_resp_desc_t resp; 876 877 bzero(&cmd, sizeof (cmd)); 878 bzero(&resp, sizeof (resp)); 879 feat->efm_mtu = ena->ena_mtu; 880 881 if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU, 882 ENAHW_FEAT_MTU_VER)) != 0) { 883 ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu, 884 ret); 885 return (B_FALSE); 886 } 887 888 return (B_TRUE); 889 } 890 891 static void 892 ena_get_link_config(ena_t *ena) 893 { 894 enahw_resp_desc_t resp; 895 enahw_feat_link_conf_t *feat = 896 &resp.erd_resp.erd_get_feat.ergf_link_conf; 897 boolean_t full_duplex; 898 899 bzero(&resp, sizeof (resp)); 900 901 if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG, 902 ENAHW_FEAT_LINK_CONFIG_VER) != 0) { 903 /* 904 * Some ENA devices do no support this feature. In 905 * those cases we report a 1Gbps link, full duplex. 906 * For the most accurate information on bandwidth 907 * limits see the official AWS documentation. 908 */ 909 ena->ena_link_speed_mbits = 1 * 1000 * 1000; 910 ena->ena_link_speeds = ENAHW_LINK_SPEED_1G; 911 ena->ena_link_duplex = LINK_DUPLEX_FULL; 912 ena->ena_link_autoneg = B_TRUE; 913 return; 914 } 915 916 ena->ena_link_speed_mbits = feat->eflc_speed; 917 ena->ena_link_speeds = feat->eflc_supported; 918 full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat); 919 ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL : 920 LINK_DUPLEX_HALF; 921 ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat); 922 } 923 924 /* 925 * Retrieve all configuration values which are modifiable via 926 * ena.conf, and set ena_t members accordingly. While the conf values 927 * have priority, they may be implicitly modified by the driver to 928 * meet resource constraints on a given platform. If no value is 929 * specified in the conf file, the driver will attempt to use the 930 * largest value supported. 
While there should be no value large 931 * enough, keep in mind that ena_get_prop() will cast the values to an 932 * int. 933 * 934 * This function should be called after the device is initialized, 935 * admin queue is established, and the hardware features/capabs have 936 * been queried; it should be called before mac registration. 937 */ 938 static boolean_t 939 ena_attach_read_conf(ena_t *ena) 940 { 941 uint32_t gcv; /* Greatest Common Value */ 942 943 /* 944 * We expect that the queue lengths are the same for both the 945 * CQ and SQ, but technically the device could return 946 * different lengths. For now the driver locks them together. 947 */ 948 gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs); 949 ASSERT3U(gcv, <=, INT_MAX); 950 ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS, 951 ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv); 952 953 ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT, 954 ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX, 955 ENA_PROP_RXQ_INTR_LIMIT_DEF); 956 957 gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs); 958 ASSERT3U(gcv, <=, INT_MAX); 959 ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS, 960 ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv); 961 962 return (B_TRUE); 963 } 964 965 /* 966 * Perform any necessary device configuration after the driver.conf 967 * has been read. 968 */ 969 static boolean_t 970 ena_attach_dev_cfg(ena_t *ena) 971 { 972 ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF); 973 974 if (!ena_set_mtu(ena)) { 975 /* 976 * We don't expect this to fail, but we try a fallback 977 * first before failing the attach sequence. 978 */ 979 ena->ena_mtu = 1500; 980 ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu); 981 982 if (!ena_set_mtu(ena)) { 983 return (B_FALSE); 984 } 985 } 986 987 return (B_TRUE); 988 } 989 990 static boolean_t 991 ena_check_versions(ena_t *ena) 992 { 993 uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION); 994 uint32_t ctrl_vsn = 995 ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION); 996 997 ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn); 998 ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn); 999 1000 ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn); 1001 ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn); 1002 ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn); 1003 ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn); 1004 1005 if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) { 1006 ena_err(ena, "unsupported controller version: %u.%u.%u", 1007 ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn, 1008 ena->ena_ctrl_subminor_vsn); 1009 return (B_FALSE); 1010 } 1011 1012 return (B_TRUE); 1013 } 1014 1015 boolean_t 1016 ena_setup_aenq(ena_t *ena) 1017 { 1018 enahw_cmd_desc_t cmd; 1019 enahw_feat_aenq_t *cmd_feat = 1020 &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq; 1021 enahw_resp_desc_t resp; 1022 enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq; 1023 enahw_aenq_groups_t to_enable; 1024 1025 bzero(&resp, sizeof (resp)); 1026 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, 1027 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1028 return (B_FALSE); 1029 } 1030 1031 to_enable = BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) | 1032 BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) | 1033 BIT(ENAHW_AENQ_GROUP_WARNING) | 1034 BIT(ENAHW_AENQ_GROUP_NOTIFICATION); 1035 to_enable &= resp_feat->efa_supported_groups; 1036 1037 bzero(&cmd, sizeof (cmd)); 1038 bzero(&resp, sizeof (cmd)); 1039 
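 /* Ask for only the groups we care about, limited to those the device claims to support; the get further below re-reads what the device actually enabled. */ 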
cmd_feat->efa_enabled_groups = to_enable; 1040 1041 if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG, 1042 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1043 return (B_FALSE); 1044 } 1045 1046 bzero(&resp, sizeof (resp)); 1047 if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, 1048 ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { 1049 return (B_FALSE); 1050 } 1051 1052 ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups; 1053 ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups; 1054 1055 for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) { 1056 ena_aenq_grpstr_t *grpstr = &ena_groups_str[i]; 1057 boolean_t supported = BIT(grpstr->eag_type) & 1058 resp_feat->efa_supported_groups; 1059 boolean_t enabled = BIT(grpstr->eag_type) & 1060 resp_feat->efa_enabled_groups; 1061 1062 ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str, 1063 supported ? "Y" : "N", enabled ? "Y" : "N"); 1064 } 1065 1066 return (B_TRUE); 1067 } 1068 1069 /* 1070 * Free all resources allocated as part of ena_device_init(). 1071 */ 1072 static void 1073 ena_cleanup_device_init(ena_t *ena) 1074 { 1075 ena_adminq_t *aq = &ena->ena_aq; 1076 1077 ena_free_host_info(ena); 1078 mutex_destroy(&aq->ea_sq_lock); 1079 mutex_destroy(&aq->ea_cq_lock); 1080 mutex_destroy(&aq->ea_stat_lock); 1081 list_destroy(&aq->ea_cmd_ctxs_free); 1082 kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen); 1083 ena_admin_sq_free(ena); 1084 ena_admin_cq_free(ena); 1085 ena_aenq_free(ena); 1086 ena_stat_device_basic_cleanup(ena); 1087 ena_stat_device_extended_cleanup(ena); 1088 ena_stat_aenq_cleanup(ena); 1089 } 1090 1091 static boolean_t 1092 ena_attach_device_init(ena_t *ena) 1093 { 1094 ena_adminq_t *aq = &ena->ena_aq; 1095 uint32_t rval, wval; 1096 uint8_t dma_width; 1097 hrtime_t timeout, cmd_timeout; 1098 hrtime_t expired; 1099 enahw_resp_desc_t resp; 1100 enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr; 1101 uint8_t *maddr; 1102 uint32_t supported_features; 1103 int ret = 0; 1104 1105 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1106 if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) { 1107 ena_err(ena, "device is not ready"); 1108 return (B_FALSE); 1109 } 1110 1111 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); 1112 1113 /* 1114 * The device stores the reset timeout at 100ms resolution; we 1115 * normalize that to nanoseconds. 1116 */ 1117 timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100); 1118 1119 if (timeout == 0) { 1120 ena_err(ena, "device gave invalid reset timeout"); 1121 return (B_FALSE); 1122 } 1123 1124 expired = gethrtime() + timeout; 1125 1126 wval = ENAHW_DEV_CTL_DEV_RESET_MASK; 1127 wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) & 1128 ENAHW_DEV_CTL_RESET_REASON_MASK; 1129 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval); 1130 1131 /* 1132 * Make sure reset is in progress. 1133 */ 1134 while (1) { 1135 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1136 1137 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) { 1138 break; 1139 } 1140 1141 if (gethrtime() > expired) { 1142 ena_err(ena, "device reset start timed out"); 1143 return (B_FALSE); 1144 } 1145 1146 /* Sleep for 100 milliseconds. */ 1147 delay(drv_usectohz(100 * 1000)); 1148 } 1149 1150 /* 1151 * Reset the timeout counter for the next device request. 1152 */ 1153 expired = gethrtime() + timeout; 1154 1155 /* 1156 * Wait for the device reset to finish. 
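 * Clearing the reset bit in the device control register tells the device to proceed; we then poll the status register until it no longer reports a reset in progress. 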
1157 */ 1158 ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0); 1159 while (1) { 1160 rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 1161 1162 if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) { 1163 break; 1164 } 1165 1166 if (gethrtime() > expired) { 1167 ena_err(ena, "device reset timed out"); 1168 return (B_FALSE); 1169 } 1170 1171 /* Sleep for 100 milliseconds. */ 1172 delay(drv_usectohz(100 * 1000)); 1173 } 1174 1175 if (!ena_check_versions(ena)) { 1176 return (B_FALSE); 1177 } 1178 1179 rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); 1180 dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval); 1181 ena->ena_dma_width = dma_width; 1182 1183 /* 1184 * As we are not using an interrupt for admin queue completion 1185 * signaling, we do not need a priority on these mutexes. If 1186 * that changes, we will have to rejigger some code to create 1187 * the admin queue interrupt before this function. 1188 */ 1189 mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL); 1190 mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL); 1191 mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL); 1192 aq->ea_qlen = ENA_ADMINQ_DEPTH; 1193 aq->ea_pending_cmds = 0; 1194 1195 aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen, 1196 KM_SLEEP); 1197 list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t), 1198 offsetof(ena_cmd_ctx_t, ectx_node)); 1199 1200 for (uint_t i = 0; i < aq->ea_qlen; i++) { 1201 ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i]; 1202 1203 ctx->ectx_id = i; 1204 ctx->ectx_pending = B_FALSE; 1205 ctx->ectx_cmd_opcode = ENAHW_CMD_NONE; 1206 ctx->ectx_resp = NULL; 1207 list_insert_tail(&aq->ea_cmd_ctxs_free, ctx); 1208 } 1209 1210 /* 1211 * The value stored in the device register is in the 1212 * resolution of 100 milliseconds. We normalize that to 1213 * nanoseconds. 1214 */ 1215 cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100); 1216 aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns); 1217 1218 if (aq->ea_cmd_timeout_ns == 0) { 1219 aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; 1220 } 1221 1222 if (!ena_admin_sq_init(ena)) { 1223 return (B_FALSE); 1224 } 1225 1226 if (!ena_admin_cq_init(ena)) { 1227 return (B_FALSE); 1228 } 1229 1230 if (!ena_aenq_init(ena)) { 1231 return (B_FALSE); 1232 } 1233 1234 /* 1235 * While the Linux driver prefers to use interrupts to deliver 1236 * admin queue completions, we just poll -- it seems to work 1237 * just fine. 
1238 */ 1239 ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, 0); 1240 aq->ea_poll_mode = B_TRUE; 1241 1242 bzero(&resp, sizeof (resp)); 1243 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES, 1244 ENAHW_FEAT_DEVICE_ATTRIBUTES_VER); 1245 1246 if (ret != 0) { 1247 ena_err(ena, "failed to get device attributes: %d", ret); 1248 return (B_FALSE); 1249 } 1250 1251 ena_dbg(ena, "impl ID: %u", feat->efda_impl_id); 1252 ena_dbg(ena, "device version: %u", feat->efda_device_version); 1253 ena_dbg(ena, "supported features: 0x%x", 1254 feat->efda_supported_features); 1255 ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width); 1256 ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with); 1257 maddr = feat->efda_mac_addr; 1258 ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1], 1259 maddr[2], maddr[3], maddr[4], maddr[5]); 1260 ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu); 1261 1262 bcopy(maddr, ena->ena_mac_addr, ETHERADDRL); 1263 ena->ena_max_mtu = feat->efda_max_mtu; 1264 supported_features = feat->efda_supported_features; 1265 ena->ena_supported_features = supported_features; 1266 feat = NULL; 1267 bzero(&resp, sizeof (resp)); 1268 1269 if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) { 1270 enahw_feat_max_queue_ext_t *feat_mqe = 1271 &resp.erd_resp.erd_get_feat.ergf_max_queue_ext; 1272 1273 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT, 1274 ENAHW_FEAT_MAX_QUEUES_EXT_VER); 1275 1276 if (ret != 0) { 1277 ena_err(ena, "failed to query max queues ext: %d", ret); 1278 return (B_FALSE); 1279 } 1280 1281 ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num; 1282 ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth; 1283 ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num; 1284 ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth; 1285 ena->ena_tx_max_desc_per_pkt = 1286 feat_mqe->efmqe_max_per_packet_tx_descs; 1287 ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size; 1288 1289 ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num; 1290 ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth; 1291 ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num; 1292 ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth; 1293 ena->ena_rx_max_desc_per_pkt = 1294 feat_mqe->efmqe_max_per_packet_rx_descs; 1295 1296 ena_set_max_io_queues(ena); 1297 } else { 1298 enahw_feat_max_queue_t *feat_mq = 1299 &resp.erd_resp.erd_get_feat.ergf_max_queue; 1300 1301 ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM, 1302 ENAHW_FEAT_MAX_QUEUES_NUM_VER); 1303 1304 if (ret != 0) { 1305 ena_err(ena, "failed to query max queues: %d", ret); 1306 return (B_FALSE); 1307 } 1308 1309 ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num; 1310 ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; 1311 ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num; 1312 ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; 1313 ena->ena_tx_max_desc_per_pkt = 1314 feat_mq->efmq_max_per_packet_tx_descs; 1315 ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size; 1316 1317 ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num; 1318 ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; 1319 ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num; 1320 ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; 1321 ena->ena_rx_max_desc_per_pkt = 1322 feat_mq->efmq_max_per_packet_rx_descs; 1323 1324 ena_set_max_io_queues(ena); 1325 } 1326 1327 ena->ena_mtu = ena->ena_max_mtu; 1328 ena_update_buf_sizes(ena); 1329 /* 1330 * We could 
use ENAHW_FEAT_HW_HINTS to determine actual SGL 1331 * sizes, for now we just force everything to use one 1332 * segment. 1333 */ 1334 ena->ena_tx_sgl_max_sz = 1; 1335 ena->ena_rx_sgl_max_sz = 1; 1336 1337 if (!ena_init_host_info(ena)) { 1338 return (B_FALSE); 1339 } 1340 1341 if (!ena_setup_aenq(ena)) { 1342 return (B_FALSE); 1343 } 1344 1345 ena_get_link_config(ena); 1346 1347 if (!ena_get_offloads(ena)) { 1348 return (B_FALSE); 1349 } 1350 1351 if (!ena_stat_device_basic_init(ena)) { 1352 return (B_FALSE); 1353 } 1354 1355 if (!ena_stat_device_extended_init(ena)) { 1356 return (B_FALSE); 1357 } 1358 1359 if (!ena_stat_aenq_init(ena)) { 1360 return (B_FALSE); 1361 } 1362 1363 return (B_TRUE); 1364 } 1365 1366 static void 1367 ena_cleanup_intr_alloc(ena_t *ena) 1368 { 1369 for (int i = 0; i < ena->ena_num_intrs; i++) { 1370 int ret = ddi_intr_free(ena->ena_intr_handles[i]); 1371 if (ret != DDI_SUCCESS) { 1372 ena_err(ena, "failed to free interrupt %d: %d", i, ret); 1373 } 1374 } 1375 1376 if (ena->ena_intr_handles != NULL) { 1377 kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz); 1378 ena->ena_intr_handles = NULL; 1379 ena->ena_intr_handles_sz = 0; 1380 } 1381 } 1382 1383 /* 1384 * The Linux driver supports only MSI-X interrupts. We do the same, 1385 * with the assumption that it's the only type of interrupt the device 1386 * can present. 1387 */ 1388 static boolean_t 1389 ena_attach_intr_alloc(ena_t *ena) 1390 { 1391 int ret; 1392 int types; 1393 int min, req, ideal, avail, actual; 1394 1395 ret = ddi_intr_get_supported_types(ena->ena_dip, &types); 1396 if (ret != DDI_SUCCESS) { 1397 ena_err(ena, "failed to get interrupt types: %d", ret); 1398 return (B_FALSE); 1399 } 1400 1401 ena_dbg(ena, "supported interrupt types: 0x%x", types); 1402 if ((types & DDI_INTR_TYPE_MSIX) == 0) { 1403 ena_err(ena, "the ena driver only supports MSI-X interrupts"); 1404 return (B_FALSE); 1405 } 1406 1407 /* One for I/O, one for adminq. 
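The ideal count below asks for one I/O vector per queue plus the admin vector. 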
*/ 1408 min = 2; 1409 ideal = ena->ena_max_io_queues + 1; 1410 ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); 1411 if (ret != DDI_SUCCESS) { 1412 ena_err(ena, "failed to get number of MSI-X interrupts: %d", 1413 ret); 1414 return (B_FALSE); 1415 } 1416 1417 if (avail < min) { 1418 ena_err(ena, "number of MSI-X interrupts is %d, but the driver " 1419 "requires a minimum of %d", avail, min); 1420 return (B_FALSE); 1421 } 1422 1423 ena_dbg(ena, "%d MSI-X interrupts available", avail); 1424 1425 ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); 1426 if (ret != DDI_SUCCESS) { 1427 ena_err(ena, "failed to get available interrupts: %d", ret); 1428 return (B_FALSE); 1429 } 1430 1431 if (avail < min) { 1432 ena_err(ena, "number of available MSI-X interrupts is %d, " 1433 "but the driver requires a minimum of %d", avail, min); 1434 return (B_FALSE); 1435 } 1436 1437 req = MIN(ideal, avail); 1438 ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t); 1439 ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP); 1440 1441 ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles, 1442 DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL); 1443 if (ret != DDI_SUCCESS) { 1444 ena_err(ena, "failed to allocate %d MSI-X interrupts: %d", 1445 req, ret); 1446 return (B_FALSE); 1447 } 1448 1449 if (actual < min) { 1450 ena_err(ena, "number of allocated interrupts is %d, but the " 1451 "driver requires a minimum of %d", actual, min); 1452 return (B_FALSE); 1453 } 1454 1455 ena->ena_num_intrs = actual; 1456 1457 ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps); 1458 if (ret != DDI_SUCCESS) { 1459 ena_err(ena, "failed to get interrupt capability: %d", ret); 1460 return (B_FALSE); 1461 } 1462 1463 ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri); 1464 if (ret != DDI_SUCCESS) { 1465 ena_err(ena, "failed to get interrupt priority: %d", ret); 1466 return (B_FALSE); 1467 } 1468 1469 ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u", 1470 actual, ena->ena_intr_caps, ena->ena_intr_pri); 1471 1472 /* 1473 * The ena_lock should not be held in the datapath, but it is 1474 * held as part of the AENQ handler, which runs in interrupt 1475 * context. Therefore, we delayed the initilization of this 1476 * mutex until after the interrupts are allocated. 1477 */ 1478 mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER, 1479 DDI_INTR_PRI(ena->ena_intr_pri)); 1480 1481 return (B_TRUE); 1482 } 1483 1484 /* 1485 * Allocate the parent Rx queue structures. More importantly, this is 1486 * NOT allocating the queue descriptors or data buffers. Those are 1487 * allocated on demand as queues are started. 1488 */ 1489 static boolean_t 1490 ena_attach_alloc_rxqs(ena_t *ena) 1491 { 1492 /* We rely on the interrupt priority for initializing the mutexes. */ 1493 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); 1494 ena->ena_num_rxqs = ena->ena_num_intrs - 1; 1495 ASSERT3U(ena->ena_num_rxqs, >, 0); 1496 ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), 1497 KM_SLEEP); 1498 1499 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { 1500 ena_rxq_t *rxq = &ena->ena_rxqs[i]; 1501 1502 rxq->er_rxqs_idx = i; 1503 /* The 0th vector is for Admin + AENQ. 
*/ 1504 rxq->er_intr_vector = i + 1; 1505 rxq->er_mrh = NULL; 1506 1507 mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER, 1508 DDI_INTR_PRI(ena->ena_intr_pri)); 1509 mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER, 1510 DDI_INTR_PRI(ena->ena_intr_pri)); 1511 1512 rxq->er_ena = ena; 1513 rxq->er_sq_num_descs = ena->ena_rxq_num_descs; 1514 rxq->er_cq_num_descs = ena->ena_rxq_num_descs; 1515 1516 if (!ena_stat_rxq_init(rxq)) { 1517 return (B_FALSE); 1518 } 1519 1520 if (!ena_alloc_rxq(rxq)) { 1521 return (B_FALSE); 1522 } 1523 } 1524 1525 return (B_TRUE); 1526 } 1527 1528 static void 1529 ena_cleanup_rxqs(ena_t *ena) 1530 { 1531 for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { 1532 ena_rxq_t *rxq = &ena->ena_rxqs[i]; 1533 1534 ena_cleanup_rxq(rxq); 1535 mutex_destroy(&rxq->er_lock); 1536 mutex_destroy(&rxq->er_stat_lock); 1537 ena_stat_rxq_cleanup(rxq); 1538 } 1539 1540 kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs)); 1541 } 1542 1543 /* 1544 * Allocate the parent Tx queue structures. More importantly, this is 1545 * NOT allocating the queue descriptors or data buffers. Those are 1546 * allocated on demand as a queue is started. 1547 */ 1548 static boolean_t 1549 ena_attach_alloc_txqs(ena_t *ena) 1550 { 1551 /* We rely on the interrupt priority for initializing the mutexes. */ 1552 VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); 1553 ena->ena_num_txqs = ena->ena_num_intrs - 1; 1554 ASSERT3U(ena->ena_num_txqs, >, 0); 1555 ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs), 1556 KM_SLEEP); 1557 1558 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 1559 ena_txq_t *txq = &ena->ena_txqs[i]; 1560 1561 txq->et_txqs_idx = i; 1562 /* The 0th vector is for Admin + AENQ. */ 1563 txq->et_intr_vector = i + 1; 1564 txq->et_mrh = NULL; 1565 1566 mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER, 1567 DDI_INTR_PRI(ena->ena_intr_pri)); 1568 mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER, 1569 DDI_INTR_PRI(ena->ena_intr_pri)); 1570 1571 txq->et_ena = ena; 1572 txq->et_sq_num_descs = ena->ena_txq_num_descs; 1573 txq->et_cq_num_descs = ena->ena_txq_num_descs; 1574 1575 if (!ena_stat_txq_init(txq)) { 1576 return (B_FALSE); 1577 } 1578 1579 if (!ena_alloc_txq(txq)) { 1580 return (B_FALSE); 1581 } 1582 } 1583 1584 return (B_TRUE); 1585 } 1586 1587 static void 1588 ena_cleanup_txqs(ena_t *ena) 1589 { 1590 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 1591 ena_txq_t *txq = &ena->ena_txqs[i]; 1592 1593 ena_cleanup_txq(txq); 1594 mutex_destroy(&txq->et_lock); 1595 mutex_destroy(&txq->et_stat_lock); 1596 ena_stat_txq_cleanup(txq); 1597 } 1598 1599 kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs)); 1600 } 1601 1602 ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = { 1603 { 1604 .ead_seq = ENA_ATTACH_PCI, 1605 .ead_name = "PCI config", 1606 .ead_attach_fn = ena_attach_pci, 1607 .ead_attach_hard_fail = B_TRUE, 1608 .ead_cleanup_fn = ena_cleanup_pci, 1609 }, 1610 1611 { 1612 .ead_seq = ENA_ATTACH_REGS, 1613 .ead_name = "BAR mapping", 1614 .ead_attach_fn = ena_attach_regs_map, 1615 .ead_attach_hard_fail = B_TRUE, 1616 .ead_cleanup_fn = ena_cleanup_regs_map, 1617 }, 1618 1619 { 1620 .ead_seq = ENA_ATTACH_DEV_INIT, 1621 .ead_name = "device initialization", 1622 .ead_attach_fn = ena_attach_device_init, 1623 .ead_attach_hard_fail = B_TRUE, 1624 .ead_cleanup_fn = ena_cleanup_device_init, 1625 }, 1626 1627 { 1628 .ead_seq = ENA_ATTACH_READ_CONF, 1629 .ead_name = "ena.conf", 1630 .ead_attach_fn = ena_attach_read_conf, 1631 .ead_attach_hard_fail = 
B_TRUE, 1632 .ead_cleanup_fn = ena_no_cleanup, 1633 }, 1634 1635 { 1636 .ead_seq = ENA_ATTACH_DEV_CFG, 1637 .ead_name = "device config", 1638 .ead_attach_fn = ena_attach_dev_cfg, 1639 .ead_attach_hard_fail = B_TRUE, 1640 .ead_cleanup_fn = ena_no_cleanup, 1641 }, 1642 1643 { 1644 .ead_seq = ENA_ATTACH_INTR_ALLOC, 1645 .ead_name = "interrupt allocation", 1646 .ead_attach_fn = ena_attach_intr_alloc, 1647 .ead_attach_hard_fail = B_TRUE, 1648 .ead_cleanup_fn = ena_cleanup_intr_alloc, 1649 }, 1650 1651 { 1652 .ead_seq = ENA_ATTACH_INTR_HDLRS, 1653 .ead_name = "interrupt handlers", 1654 .ead_attach_fn = ena_intr_add_handlers, 1655 .ead_attach_hard_fail = B_TRUE, 1656 .ead_cleanup_fn = ena_intr_remove_handlers, 1657 }, 1658 1659 { 1660 .ead_seq = ENA_ATTACH_TXQS_ALLOC, 1661 .ead_name = "Tx queues", 1662 .ead_attach_fn = ena_attach_alloc_txqs, 1663 .ead_attach_hard_fail = B_TRUE, 1664 .ead_cleanup_fn = ena_cleanup_txqs, 1665 }, 1666 1667 { 1668 .ead_seq = ENA_ATTACH_RXQS_ALLOC, 1669 .ead_name = "Rx queues", 1670 .ead_attach_fn = ena_attach_alloc_rxqs, 1671 .ead_attach_hard_fail = B_TRUE, 1672 .ead_cleanup_fn = ena_cleanup_rxqs, 1673 }, 1674 1675 /* 1676 * The chance of mac_unregister() failure poses a problem for 1677 * cleanup. We address interrupt disablement and mac 1678 * unregistration explicitly in the attach/detach routines. 1679 */ 1680 { 1681 .ead_seq = ENA_ATTACH_MAC_REGISTER, 1682 .ead_name = "mac registration", 1683 .ead_attach_fn = ena_mac_register, 1684 .ead_attach_hard_fail = B_TRUE, 1685 .ead_cleanup_fn = ena_no_cleanup, 1686 }, 1687 1688 { 1689 .ead_seq = ENA_ATTACH_INTRS_ENABLE, 1690 .ead_name = "enable interrupts", 1691 .ead_attach_fn = ena_intrs_enable, 1692 .ead_attach_hard_fail = B_TRUE, 1693 .ead_cleanup_fn = ena_no_cleanup, 1694 } 1695 }; 1696 1697 /* 1698 * This function undoes any work done by ena_attach(), either in 1699 * response to a failed attach or a planned detach. At the end of this 1700 * function ena_attach_seq should be zero, otherwise it means 1701 * something has not been freed/uninitialized. 1702 */ 1703 static void 1704 ena_cleanup(ena_t *ena) 1705 { 1706 if (ena == NULL || ena->ena_attach_seq == 0) { 1707 return; 1708 } 1709 1710 /* 1711 * We VERIFY this because if the seq is greater than the number of 1712 * entries we drift into space and execute god knows what. 
1713 */ 1714 VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES); 1715 1716 while (ena->ena_attach_seq > 0) { 1717 int idx = ena->ena_attach_seq - 1; 1718 ena_attach_desc_t *desc = &ena_attach_tbl[idx]; 1719 1720 ena_dbg(ena, "running cleanup sequence: %s (%d)", 1721 desc->ead_name, idx); 1722 1723 desc->ead_cleanup_fn(ena); 1724 ena->ena_attach_seq--; 1725 } 1726 1727 ASSERT3U(ena->ena_attach_seq, ==, 0); 1728 mutex_destroy(&ena->ena_lock); 1729 } 1730 1731 static int 1732 ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 1733 { 1734 ena_t *ena; 1735 1736 if (cmd != DDI_ATTACH) { 1737 return (DDI_FAILURE); 1738 } 1739 1740 ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP); 1741 ena->ena_instance = ddi_get_instance(dip); 1742 ena->ena_dip = dip; 1743 ena->ena_instance = ddi_get_instance(dip); 1744 ena->ena_page_sz = ddi_ptob(dip, 1); 1745 1746 for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) { 1747 boolean_t success; 1748 ena_attach_desc_t *desc = &ena_attach_tbl[i]; 1749 1750 ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name, 1751 i); 1752 1753 if (!(success = desc->ead_attach_fn(ena))) { 1754 ena_err(ena, "attach sequence failed: %s (%d)", 1755 desc->ead_name, i); 1756 1757 if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) { 1758 /* 1759 * In this specific case 1760 * ENA_ATTACH_INTRS_ENABLE has failed, 1761 * and we may or may not be able to 1762 * unregister the mac, depending on if 1763 * something in userspace has created 1764 * a client on top. 1765 * 1766 * NOTE: Something that would be nice 1767 * to add to mac is the ability to 1768 * register a provider separate from 1769 * "publishing" it to the rest of the 1770 * system. This would allow a driver 1771 * to register its mac, do some 1772 * additional work that might fail, 1773 * and then unregister if that work 1774 * fails without concern for any 1775 * chance of failure when calling 1776 * unregister. This would remove the 1777 * complexity of the situation we are 1778 * trying to address here, as we would 1779 * know that until the mac has been 1780 * "published", there is no chance for 1781 * mac_unregister() to fail. 1782 */ 1783 if (ena_mac_unregister(ena) != 0) { 1784 return (DDI_FAILURE); 1785 } 1786 1787 ena->ena_attach_seq--; 1788 } else { 1789 /* 1790 * Since the ead_seq is predicated on 1791 * successful ead_attach_fn we must 1792 * run the specific cleanup handler 1793 * before calling the global cleanup 1794 * routine. This also means that all 1795 * cleanup functions must be able to 1796 * deal with partial success of the 1797 * corresponding ead_attach_fn. 1798 */ 1799 desc->ead_cleanup_fn(ena); 1800 } 1801 1802 ena_cleanup(ena); 1803 kmem_free(ena, sizeof (ena_t)); 1804 return (DDI_FAILURE); 1805 } 1806 1807 if (success) { 1808 ena_dbg(ena, "attach sequence completed: %s (%d)", 1809 desc->ead_name, i); 1810 } 1811 1812 ena->ena_attach_seq = desc->ead_seq; 1813 } 1814 1815 /* 1816 * Now that interrupts are enabled make sure to tell the 1817 * device that all AENQ descriptors are ready for writing. 
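 * Writing the full ring depth to the AENQ head doorbell hands every descriptor in the ring over to the device. 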
1818 */ 1819 ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB, 1820 ena->ena_aenq.eaenq_num_descs); 1821 1822 ddi_set_driver_private(dip, ena); 1823 return (DDI_SUCCESS); 1824 } 1825 1826 static int 1827 ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 1828 { 1829 ena_t *ena = ddi_get_driver_private(dip); 1830 1831 if (ena == NULL) { 1832 return (DDI_FAILURE); 1833 } 1834 1835 /* 1836 * Before we can proceed to cleanup we have to treat 1837 * mac_unregister() explicitly -- if there are still 1838 * outstanding clients, then we can't proceed with detach or 1839 * cleanup. 1840 */ 1841 1842 /* 1843 * Why this would fail I don't know, but if we proceed to mac 1844 * unregister, then there is a good chance we will panic in 1845 * the Rx interrupt handler when calling mac_rx_ring() 1846 */ 1847 if (!ena_intrs_disable(ena)) { 1848 return (DDI_FAILURE); 1849 } 1850 1851 /* We can't detach if clients are actively using the device. */ 1852 if (ena_mac_unregister(ena) != 0) { 1853 (void) ena_intrs_enable(ena); 1854 return (DDI_FAILURE); 1855 } 1856 1857 /* 1858 * At this point we can proceed with the rest of cleanup on a 1859 * best-effort basis. 1860 */ 1861 ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC; 1862 ena_cleanup(ena); 1863 ddi_set_driver_private(dip, NULL); 1864 kmem_free(ena, sizeof (ena_t)); 1865 return (DDI_SUCCESS); 1866 } 1867 1868 static struct cb_ops ena_cb_ops = { 1869 .cb_open = nodev, 1870 .cb_close = nodev, 1871 .cb_strategy = nodev, 1872 .cb_print = nodev, 1873 .cb_dump = nodev, 1874 .cb_read = nodev, 1875 .cb_write = nodev, 1876 .cb_ioctl = nodev, 1877 .cb_devmap = nodev, 1878 .cb_mmap = nodev, 1879 .cb_segmap = nodev, 1880 .cb_chpoll = nochpoll, 1881 .cb_prop_op = ddi_prop_op, 1882 .cb_flag = D_MP, 1883 .cb_rev = CB_REV, 1884 .cb_aread = nodev, 1885 .cb_awrite = nodev 1886 }; 1887 1888 static struct dev_ops ena_dev_ops = { 1889 .devo_rev = DEVO_REV, 1890 .devo_refcnt = 0, 1891 .devo_getinfo = NULL, 1892 .devo_identify = nulldev, 1893 .devo_probe = nulldev, 1894 .devo_attach = ena_attach, 1895 .devo_detach = ena_detach, 1896 .devo_reset = nodev, 1897 .devo_quiesce = ddi_quiesce_not_supported, 1898 .devo_cb_ops = &ena_cb_ops 1899 }; 1900 1901 static struct modldrv ena_modldrv = { 1902 .drv_modops = &mod_driverops, 1903 .drv_linkinfo = "AWS ENA Ethernet", 1904 .drv_dev_ops = &ena_dev_ops 1905 }; 1906 1907 static struct modlinkage ena_modlinkage = { 1908 .ml_rev = MODREV_1, 1909 .ml_linkage = { &ena_modldrv, NULL } 1910 }; 1911 1912 int 1913 _init(void) 1914 { 1915 int ret; 1916 1917 mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME); 1918 1919 if ((ret = mod_install(&ena_modlinkage)) != 0) { 1920 mac_fini_ops(&ena_dev_ops); 1921 return (ret); 1922 } 1923 1924 return (ret); 1925 } 1926 1927 int 1928 _info(struct modinfo *modinfop) 1929 { 1930 return (mod_info(&ena_modlinkage, modinfop)); 1931 } 1932 1933 int 1934 _fini(void) 1935 { 1936 int ret; 1937 1938 if ((ret = mod_remove(&ena_modlinkage)) != 0) { 1939 return (ret); 1940 } 1941 1942 mac_fini_ops(&ena_dev_ops); 1943 return (ret); 1944 } 1945