/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

#include "ena.h"

/*
 * Post `num` receive descriptors to the Rx SQ, recycling the queue's
 * pre-allocated data buffers, and ring the doorbell once they have all
 * been written. The caller must hold the Rx queue lock.
 */
static void
ena_refill_rx(ena_rxq_t *rxq, uint16_t num)
{
	VERIFY3P(rxq, !=, NULL);
	ASSERT(MUTEX_HELD(&rxq->er_lock));
	ASSERT3U(num, <=, rxq->er_sq_num_descs);

	const uint16_t modulo_mask = rxq->er_sq_num_descs - 1;
	uint16_t tail_mod = rxq->er_sq_tail_idx & modulo_mask;

	while (num != 0) {
		enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod];
		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod];
		uint16_t phase = rxq->er_sq_phase;

		VERIFY3U(tail_mod, <, rxq->er_sq_num_descs);
		VERIFY3P(desc, !=, NULL);
		VERIFY3P(rcb, !=, NULL);
		VERIFY3P(desc, >=, rxq->er_sq_descs);
		VERIFY3P(desc, <=,
		    (rxq->er_sq_descs + rxq->er_sq_num_descs - 1));

		desc->erd_length = rcb->ercb_dma.edb_len;
		desc->erd_req_id = tail_mod;
		VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL);
		ena_set_dma_addr_values(rxq->er_ena,
		    rcb->ercb_dma.edb_cookie->dmac_laddress,
		    &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi);

		ENAHW_RX_DESC_CLEAR_CTRL(desc);
		ENAHW_RX_DESC_SET_PHASE(desc, phase);
		ENAHW_RX_DESC_SET_FIRST(desc);
		ENAHW_RX_DESC_SET_LAST(desc);
		ENAHW_RX_DESC_SET_COMP_REQ(desc);
		DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc);
		rxq->er_sq_tail_idx++;
		tail_mod = rxq->er_sq_tail_idx & modulo_mask;

		if (tail_mod == 0)
			rxq->er_sq_phase ^= 1;

		num--;
	}

	ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV);
	ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr,
	    rxq->er_sq_tail_idx);
}

/*
 * Free the Rx data buffers along with the SQ and CQ descriptor rings.
 */
void
ena_free_rx_dma(ena_rxq_t *rxq)
{
	if (rxq->er_rcbs != NULL) {
		for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
			ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
			ena_dma_free(&rcb->ercb_dma);
		}

		kmem_free(rxq->er_rcbs,
		    sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs);

		rxq->er_rcbs = NULL;
	}

	ena_dma_free(&rxq->er_cq_dma);
	rxq->er_cq_descs = NULL;
	rxq->er_cq_num_descs = 0;

	ena_dma_free(&rxq->er_sq_dma);
	rxq->er_sq_descs = NULL;
	rxq->er_sq_num_descs = 0;

	rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC;
}

/*
 * Allocate DMA memory for the SQ and CQ descriptor rings, along with a
 * data buffer for each SQ descriptor.
 */
static int
ena_alloc_rx_dma(ena_rxq_t *rxq)
{
	ena_t *ena = rxq->er_ena;
	size_t cq_descs_sz;
	size_t sq_descs_sz;
	int err = 0;

	cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs);
	sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs);

	ena_dma_conf_t sq_conf = {
		.edc_size = sq_descs_sz,
		.edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
		.edc_sgl = 1,
		.edc_endian = DDI_NEVERSWAP_ACC,
		.edc_stream = false,
	};

	if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &sq_conf, sq_descs_sz)) {
		return (ENOMEM);
	}

	rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va;
	rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) *
	    rxq->er_sq_num_descs, KM_SLEEP);

	for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
		ena_dma_conf_t buf_conf = {
			.edc_size = ena->ena_rx_buf_sz,
			.edc_align = 1,
			.edc_sgl = ena->ena_rx_sgl_max_sz,
			.edc_endian = DDI_NEVERSWAP_ACC,
			.edc_stream = true,
		};

		if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf,
		    ena->ena_rx_buf_sz)) {
			err = ENOMEM;
			goto error;
		}
	}

	ena_dma_conf_t cq_conf = {
		.edc_size = cq_descs_sz,
		.edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
		.edc_sgl = 1,
		.edc_endian = DDI_NEVERSWAP_ACC,
		.edc_stream = false,
	};

	if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &cq_conf, cq_descs_sz)) {
		err = ENOMEM;
		goto error;
	}

	rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va;
	rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC;
	return (0);

error:
	ena_free_rx_dma(rxq);
	return (err);
}

/*
 * Allocate host DMA resources for the Rx queue and create its
 * device-side Completion and Submission Queues.
 */
bool
ena_alloc_rxq(ena_rxq_t *rxq)
{
	int ret = 0;
	ena_t *ena = rxq->er_ena;
	uint16_t cq_hw_idx, sq_hw_idx;
	uint32_t *cq_unmask_addr, *cq_numanode;
	uint32_t *sq_db_addr;

	/*
	 * First, allocate the Rx data buffers.
	 */
	if ((ret = ena_alloc_rx_dma(rxq)) != 0) {
		ena_err(ena, "failed to allocate Rx queue %u data buffers: %d",
		    rxq->er_rxqs_idx, ret);
		return (false);
	}

	ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC);

	/*
	 * Second, create the Completion Queue.
	 */
	ret = ena_create_cq(ena, rxq->er_cq_num_descs,
	    rxq->er_cq_dma.edb_cookie->dmac_laddress, false,
	    rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_numanode);

	if (ret != 0) {
		ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx,
		    ret);
		return (false);
	}

	/* The phase must always start on 1. */
	rxq->er_cq_phase = 1;
	rxq->er_cq_head_idx = 0;
	rxq->er_cq_hw_idx = cq_hw_idx;
	rxq->er_cq_unmask_addr = cq_unmask_addr;
	rxq->er_cq_numa_addr = cq_numanode;
	rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED;

	/*
	 * Third, create the Submission Queue to match with the above
	 * CQ. At this time we force the SQ and CQ to have the same
	 * number of descriptors as we only use a 1:1 completion
	 * policy. However, in the future, we could loosen this and
	 * use an on-demand completion policy and the two could have a
	 * different number of descriptors.
	 */
	ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs);
	ret = ena_create_sq(ena, rxq->er_sq_num_descs,
	    rxq->er_sq_dma.edb_cookie->dmac_laddress, false, cq_hw_idx,
	    &sq_hw_idx, &sq_db_addr);

	if (ret != 0) {
		ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx,
		    ret);
		return (false);
	}

	ASSERT3P(sq_db_addr, !=, NULL);
	rxq->er_sq_hw_idx = sq_hw_idx;
	rxq->er_sq_db_addr = sq_db_addr;
	/* The phase must always start on 1. */
	rxq->er_sq_phase = 1;
	rxq->er_sq_tail_idx = 0;
	rxq->er_sq_avail_descs = rxq->er_sq_num_descs;
	rxq->er_mode = ENA_RXQ_MODE_INTR;
	rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED;

	return (true);
}

/*
 * Tear down an Rx queue: destroy its device-side SQ and CQ (skipped
 * when the device is being reset) and free its host DMA resources.
 */
void
ena_cleanup_rxq(ena_rxq_t *rxq, bool resetting)
{
	int ret = 0;
	ena_t *ena = rxq->er_ena;

	if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) {
		if (!resetting) {
			ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, false);

			if (ret != 0) {
				ena_err(ena, "failed to destroy Rx SQ %u: %d",
				    rxq->er_rxqs_idx, ret);
			}
		}

		rxq->er_sq_hw_idx = 0;
		rxq->er_sq_db_addr = NULL;
		rxq->er_sq_tail_idx = 0;
		rxq->er_sq_phase = 0;
		rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED;
		rxq->er_state &= ~ENA_RXQ_STATE_SQ_FILLED;
	}

	if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) {
		if (!resetting) {
			ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx);

			if (ret != 0) {
				ena_err(ena, "failed to destroy Rx CQ %u: %d",
				    rxq->er_rxqs_idx, ret);
			}
		}

		rxq->er_cq_hw_idx = 0;
		rxq->er_cq_head_idx = 0;
		rxq->er_cq_phase = 0;
		rxq->er_cq_unmask_addr = NULL;
		rxq->er_cq_numa_addr = NULL;
		rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED;
	}

	ena_free_rx_dma(rxq);
	ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE);
}

/*
 * mac ring stop callback: mask the queue's interrupt and clear the
 * ready/running state.
 */
void
ena_ring_rx_stop(mac_ring_driver_t rh)
{
	ena_rxq_t *rxq = (ena_rxq_t *)rh;
	uint32_t intr_ctrl;

	intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
	ENAHW_REG_INTR_MASK(intr_ctrl);
	ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);

	rxq->er_state &= ~ENA_RXQ_STATE_RUNNING;
	rxq->er_state &= ~ENA_RXQ_STATE_READY;
}

/*
 * mac ring start callback: fill the Rx SQ on first start, record the
 * mac generation number, and unmask the queue's interrupt.
 */
int
ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num)
{
	ena_rxq_t *rxq = (ena_rxq_t *)rh;
	ena_t *ena = rxq->er_ena;
	uint32_t intr_ctrl;

	ena_dbg(ena, "ring_rx_start %p: state 0x%x", rxq, rxq->er_state);

	mutex_enter(&rxq->er_lock);
	if ((rxq->er_state & ENA_RXQ_STATE_SQ_FILLED) == 0) {
		/*
		 * The ENA controller gets upset and sets the fatal error bit
		 * in its status register if we write a value to an RX SQ's
		 * doorbell that is past its current head. This makes sense as
		 * it would represent there being more descriptors available
		 * than can fit in the ring. For this reason, we make sure that
		 * we only fill the ring once, even if it is started multiple
		 * times.
		 *
		 * The `- 1` below is harder to explain. If we completely fill
		 * the SQ ring, then at some later time, seemingly independent
		 * of how many times we've been around the ring, the ENA
		 * controller will set the fatal error bit and stop
		 * responding. Leaving a gap prevents this somehow and it is
		 * what the other open source drivers do.
		 */
317 */ 318 ena_refill_rx(rxq, rxq->er_sq_num_descs - 1); 319 rxq->er_state |= ENA_RXQ_STATE_SQ_FILLED; 320 } 321 rxq->er_m_gen_num = gen_num; 322 rxq->er_intr_limit = ena->ena_rxq_intr_limit; 323 mutex_exit(&rxq->er_lock); 324 325 rxq->er_state |= ENA_RXQ_STATE_READY; 326 327 intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr); 328 ENAHW_REG_INTR_UNMASK(intr_ctrl); 329 ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl); 330 rxq->er_state |= ENA_RXQ_STATE_RUNNING; 331 return (0); 332 } 333 334 mblk_t * 335 ena_ring_rx(ena_rxq_t *rxq, int poll_bytes) 336 { 337 ena_t *ena = rxq->er_ena; 338 const uint16_t modulo_mask = rxq->er_cq_num_descs - 1; 339 uint16_t head_mod = rxq->er_cq_head_idx & modulo_mask; 340 uint64_t total_bytes = 0; 341 uint64_t num_frames = 0; 342 enahw_rx_cdesc_t *cdesc; 343 bool polling = true; 344 mblk_t *head = NULL; 345 mblk_t *tail = NULL; 346 347 ASSERT(MUTEX_HELD(&rxq->er_lock)); 348 ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL); 349 350 if (poll_bytes == ENA_INTERRUPT_MODE) { 351 polling = false; 352 } 353 354 cdesc = &rxq->er_cq_descs[head_mod]; 355 VERIFY3P(cdesc, >=, rxq->er_cq_descs); 356 VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); 357 358 while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) { 359 bool first, last; 360 ena_rx_ctrl_block_t *rcb; 361 uint16_t req_id; 362 mblk_t *mp; 363 enahw_io_l3_proto_t l3proto; 364 enahw_io_l4_proto_t l4proto; 365 bool l4csum_checked; 366 uint32_t hflags = 0; 367 368 VERIFY3U(head_mod, <, rxq->er_cq_num_descs); 369 /* 370 * Currently, all incoming frames fit in a single Rx 371 * buffer (erd_length > total frame size). In the 372 * future, if we decide to loan buffers which are 373 * smaller, we will need to modify this code to read 374 * one or more descriptors (based on frame size). 375 * 376 * For this reason we do not expect any frame to span 377 * multiple descriptors. Therefore, we drop any data 378 * not delivered as a single descriptor, i.e., where 379 * 'first' and 'last' are both true. 380 */ 381 first = ENAHW_RX_CDESC_FIRST(cdesc); 382 last = ENAHW_RX_CDESC_LAST(cdesc); 383 384 if (!first || !last) { 385 mutex_enter(&rxq->er_stat_lock); 386 rxq->er_stat.ers_multi_desc.value.ui64++; 387 mutex_exit(&rxq->er_stat_lock); 388 goto next_desc; 389 } 390 391 req_id = cdesc->erc_req_id; 392 VERIFY3U(req_id, <, rxq->er_cq_num_descs); 393 rcb = &rxq->er_rcbs[req_id]; 394 rcb->ercb_offset = cdesc->erc_offset; 395 rcb->ercb_length = cdesc->erc_length; 396 ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total); 397 mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0); 398 399 /* 400 * If we can't allocate an mblk, things are looking 401 * grim. Forget about this frame and move on. 402 */ 403 if (mp == NULL) { 404 mutex_enter(&rxq->er_stat_lock); 405 rxq->er_stat.ers_allocb_fail.value.ui64++; 406 mutex_exit(&rxq->er_stat_lock); 407 goto next_desc; 408 } 409 410 /* 411 * As we pull frames we need to link them together as 412 * one chain to be delivered up to mac. 413 */ 414 if (head == NULL) { 415 head = mp; 416 } else { 417 tail->b_next = mp; 418 } 419 420 tail = mp; 421 422 /* 423 * We need to make sure the bytes are copied to the 424 * correct offset to achieve 4-byte IP header 425 * alignment. 426 * 427 * If we start using desballoc on the buffers, then we 428 * will need to make sure to apply this offset to the 429 * DMA buffers as well. 
		 * Though it may be the case the device does this
		 * implicitly and that's what cdesc->erc_offset is
		 * for; we don't know because it's not documented.
		 */
		mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
		mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
		bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr,
		    rcb->ercb_length);
		mp->b_wptr += rcb->ercb_length;
		total_bytes += rcb->ercb_length;
		VERIFY3P(mp->b_wptr, >, mp->b_rptr);
		VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim);

		l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc);
		l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc);

		/*
		 * When it comes to bad TCP/IP checksums we do not
		 * discard the packet at this level. Instead, we let
		 * it percolate up for further processing and tracking
		 * by the upstream TCP/IP stack.
		 */
		if (ena->ena_rx_l3_ipv4_csum &&
		    l3proto == ENAHW_IO_L3_PROTO_IPV4) {
			bool l3_csum_err =
			    ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc);

			if (l3_csum_err) {
				mutex_enter(&rxq->er_stat_lock);
				rxq->er_stat.ers_hck_ipv4_err.value.ui64++;
				mutex_exit(&rxq->er_stat_lock);
			} else {
				hflags |= HCK_IPV4_HDRCKSUM_OK;
			}
		}

		l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc);

		if (ena->ena_rx_l4_ipv4_csum && l4csum_checked &&
		    l4proto == ENAHW_IO_L4_PROTO_TCP) {
			bool l4_csum_err =
			    ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc);

			if (l4_csum_err) {
				mutex_enter(&rxq->er_stat_lock);
				rxq->er_stat.ers_hck_l4_err.value.ui64++;
				mutex_exit(&rxq->er_stat_lock);
			} else {
				hflags |= HCK_FULLCKSUM_OK;
			}
		}

		if (hflags != 0) {
			mac_hcksum_set(mp, 0, 0, 0, 0, hflags);
		}

next_desc:
		/*
		 * Technically, if we arrived here due to a failure,
		 * then we did not read a new frame. However, we count
		 * it all the same so that it still counts as progress
		 * toward the interrupt work limit. The failure stats
		 * allow us to differentiate good frames from bad.
		 */
494 */ 495 num_frames++; 496 rxq->er_cq_head_idx++; 497 head_mod = rxq->er_cq_head_idx & modulo_mask; 498 if (head_mod == 0) 499 rxq->er_cq_phase ^= 1; 500 501 if (polling && total_bytes > poll_bytes) { 502 break; 503 } else if (!polling && num_frames >= rxq->er_intr_limit) { 504 mutex_enter(&rxq->er_stat_lock); 505 rxq->er_stat.ers_intr_limit.value.ui64++; 506 mutex_exit(&rxq->er_stat_lock); 507 break; 508 } 509 510 cdesc = &rxq->er_cq_descs[head_mod]; 511 VERIFY3P(cdesc, >=, rxq->er_cq_descs); 512 VERIFY3P(cdesc, <=, 513 (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); 514 } 515 516 if (num_frames > 0) { 517 mutex_enter(&rxq->er_stat_lock); 518 rxq->er_stat.ers_packets.value.ui64 += num_frames; 519 rxq->er_stat.ers_bytes.value.ui64 += total_bytes; 520 mutex_exit(&rxq->er_stat_lock); 521 522 DTRACE_PROBE5(rx__frames, ena_rxq_t *, rxq, mblk_t *, head, 523 bool, polling, uint64_t, num_frames, uint64_t, total_bytes); 524 ena_refill_rx(rxq, num_frames); 525 } 526 527 return (head); 528 } 529 530 void 531 ena_rx_intr_work(ena_rxq_t *rxq) 532 { 533 mblk_t *mp; 534 535 mutex_enter(&rxq->er_lock); 536 mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE); 537 mutex_exit(&rxq->er_lock); 538 539 if (mp == NULL) { 540 return; 541 } 542 543 mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num); 544 } 545 546 mblk_t * 547 ena_ring_rx_poll(void *rh, int poll_bytes) 548 { 549 ena_rxq_t *rxq = rh; 550 mblk_t *mp; 551 552 ASSERT3S(poll_bytes, >, 0); 553 554 mutex_enter(&rxq->er_lock); 555 mp = ena_ring_rx(rxq, poll_bytes); 556 mutex_exit(&rxq->er_lock); 557 558 return (mp); 559 } 560