1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2024 Oxide Computer Company 14 */ 15 #include "ena.h" 16 17 static void 18 ena_refill_rx(ena_rxq_t *rxq, uint16_t num) 19 { 20 VERIFY3P(rxq, !=, NULL); 21 ASSERT(MUTEX_HELD(&rxq->er_lock)); 22 ASSERT3U(num, <=, rxq->er_sq_num_descs); 23 uint16_t tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1); 24 25 while (num != 0) { 26 enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod]; 27 ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod]; 28 uint16_t phase = rxq->er_sq_phase; 29 30 VERIFY3U(tail_mod, <, rxq->er_sq_num_descs); 31 VERIFY3P(desc, !=, NULL); 32 VERIFY3P(rcb, !=, NULL); 33 VERIFY3P(desc, >=, rxq->er_sq_descs); 34 VERIFY3P(desc, <=, 35 (rxq->er_sq_descs + rxq->er_sq_num_descs - 1)); 36 37 desc->erd_length = rcb->ercb_dma.edb_len; 38 desc->erd_req_id = tail_mod; 39 VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL); 40 ena_set_dma_addr_values(rxq->er_ena, 41 rcb->ercb_dma.edb_cookie->dmac_laddress, 42 &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi); 43 44 ENAHW_RX_DESC_CLEAR_CTRL(desc); 45 ENAHW_RX_DESC_SET_PHASE(desc, phase); 46 ENAHW_RX_DESC_SET_FIRST(desc); 47 ENAHW_RX_DESC_SET_LAST(desc); 48 ENAHW_RX_DESC_SET_COMP_REQ(desc); 49 DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc); 50 rxq->er_sq_tail_idx++; 51 tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1); 52 53 if (tail_mod == 0) { 54 rxq->er_sq_phase ^= 1; 55 } 56 57 num--; 58 } 59 60 ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV); 61 ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr, 62 rxq->er_sq_tail_idx); 63 } 64 65 void 66 ena_free_rx_dma(ena_rxq_t *rxq) 67 { 68 if (rxq->er_rcbs != NULL) { 69 for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) { 70 ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i]; 71 ena_dma_free(&rcb->ercb_dma); 72 } 73 74 kmem_free(rxq->er_rcbs, 75 sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs); 76 77 rxq->er_rcbs = NULL; 78 } 79 80 ena_dma_free(&rxq->er_cq_dma); 81 rxq->er_cq_descs = NULL; 82 rxq->er_cq_num_descs = 0; 83 84 ena_dma_free(&rxq->er_sq_dma); 85 rxq->er_sq_descs = NULL; 86 rxq->er_sq_num_descs = 0; 87 88 rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC; 89 } 90 91 static int 92 ena_alloc_rx_dma(ena_rxq_t *rxq) 93 { 94 ena_t *ena = rxq->er_ena; 95 size_t cq_descs_sz; 96 size_t sq_descs_sz; 97 ena_dma_conf_t conf; 98 int err = 0; 99 100 cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs); 101 sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs); 102 /* BEGIN CSTYLED */ 103 conf = (ena_dma_conf_t) { 104 .edc_size = sq_descs_sz, 105 .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT, 106 .edc_sgl = 1, 107 .edc_endian = DDI_NEVERSWAP_ACC, 108 .edc_stream = B_FALSE, 109 }; 110 /* END CSTYLED */ 111 112 if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &conf, sq_descs_sz)) { 113 return (ENOMEM); 114 } 115 116 rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va; 117 rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) * 118 rxq->er_sq_num_descs, KM_SLEEP); 119 120 for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) { 121 ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i]; 122 ena_dma_conf_t buf_conf = { 123 .edc_size = ena->ena_rx_buf_sz, 124 .edc_align = 1, 125 .edc_sgl = ena->ena_rx_sgl_max_sz, 126 .edc_endian = DDI_NEVERSWAP_ACC, 127 .edc_stream = B_TRUE, 128 }; 129 130 if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf, 131 ena->ena_rx_buf_sz)) { 132 err = ENOMEM; 133 goto error; 134 } 135 } 136 137 /* BEGIN CSTYLED */ 138 conf = (ena_dma_conf_t) { 139 .edc_size = cq_descs_sz, 140 .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT, 141 .edc_sgl = 1, 142 .edc_endian = DDI_NEVERSWAP_ACC, 143 .edc_stream = B_FALSE, 144 }; 145 /* END CSTYLED */ 146 147 if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &conf, cq_descs_sz)) { 148 err = ENOMEM; 149 goto error; 150 } 151 152 rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va; 153 rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC; 154 return (0); 155 156 error: 157 ena_free_rx_dma(rxq); 158 return (err); 159 } 160 161 boolean_t 162 ena_alloc_rxq(ena_rxq_t *rxq) 163 { 164 int ret = 0; 165 ena_t *ena = rxq->er_ena; 166 uint16_t cq_hw_idx, sq_hw_idx; 167 uint32_t *cq_unmask_addr, *cq_numanode; 168 uint32_t *sq_db_addr; 169 170 /* 171 * First, allocate the Rx data buffers. 172 */ 173 if ((ret = ena_alloc_rx_dma(rxq)) != 0) { 174 ena_err(ena, "failed to allocate Rx queue %u data buffers: %d", 175 rxq->er_rxqs_idx, ret); 176 return (B_FALSE); 177 } 178 179 ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC); 180 181 /* 182 * Second, create the Completion Queue. 183 */ 184 ret = ena_create_cq(ena, rxq->er_cq_num_descs, 185 rxq->er_cq_dma.edb_cookie->dmac_laddress, B_FALSE, 186 rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_numanode); 187 188 if (ret != 0) { 189 ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx, 190 ret); 191 return (B_FALSE); 192 } 193 194 /* The phase must always start on 1. */ 195 rxq->er_cq_phase = 1; 196 rxq->er_cq_head_idx = 0; 197 rxq->er_cq_hw_idx = cq_hw_idx; 198 rxq->er_cq_unmask_addr = cq_unmask_addr; 199 rxq->er_cq_numa_addr = cq_numanode; 200 rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED; 201 202 /* 203 * Third, create the Submission Queue to match with the above 204 * CQ. At this time we force the SQ and CQ to have the same 205 * number of descriptors as we only use a 1:1 completion 206 * policy. However, in the future, we could loosen this and 207 * use an on-demand completion policy and the two could have a 208 * different number of descriptors. 209 */ 210 ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs); 211 ret = ena_create_sq(ena, rxq->er_sq_num_descs, 212 rxq->er_sq_dma.edb_cookie->dmac_laddress, B_FALSE, cq_hw_idx, 213 &sq_hw_idx, &sq_db_addr); 214 215 if (ret != 0) { 216 ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx, 217 ret); 218 return (B_FALSE); 219 } 220 221 ASSERT3P(sq_db_addr, !=, NULL); 222 rxq->er_sq_hw_idx = sq_hw_idx; 223 rxq->er_sq_db_addr = sq_db_addr; 224 /* The phase must always start on 1. */ 225 rxq->er_sq_phase = 1; 226 rxq->er_sq_tail_idx = 0; 227 rxq->er_sq_avail_descs = rxq->er_sq_num_descs; 228 rxq->er_mode = ENA_RXQ_MODE_INTR; 229 rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED; 230 231 return (B_TRUE); 232 } 233 234 void 235 ena_cleanup_rxq(ena_rxq_t *rxq) 236 { 237 int ret = 0; 238 ena_t *ena = rxq->er_ena; 239 240 if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) { 241 ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, B_FALSE); 242 243 if (ret != 0) { 244 ena_err(ena, "failed to destroy Rx SQ %u: %d", 245 rxq->er_rxqs_idx, ret); 246 } 247 248 rxq->er_sq_hw_idx = 0; 249 rxq->er_sq_db_addr = NULL; 250 rxq->er_sq_tail_idx = 0; 251 rxq->er_sq_phase = 0; 252 rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED; 253 rxq->er_state &= ~ENA_RXQ_STATE_SQ_FILLED; 254 } 255 256 if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) { 257 ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx); 258 259 if (ret != 0) { 260 ena_err(ena, "failed to destroy Rx CQ %u: %d", 261 rxq->er_rxqs_idx, ret); 262 } 263 264 rxq->er_cq_hw_idx = 0; 265 rxq->er_cq_head_idx = 0; 266 rxq->er_cq_phase = 0; 267 rxq->er_cq_unmask_addr = NULL; 268 rxq->er_cq_numa_addr = NULL; 269 rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED; 270 } 271 272 ena_free_rx_dma(rxq); 273 ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE); 274 } 275 276 void 277 ena_ring_rx_stop(mac_ring_driver_t rh) 278 { 279 ena_rxq_t *rxq = (ena_rxq_t *)rh; 280 uint32_t intr_ctrl; 281 282 intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr); 283 ENAHW_REG_INTR_MASK(intr_ctrl); 284 ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl); 285 286 rxq->er_state &= ~ENA_RXQ_STATE_RUNNING; 287 rxq->er_state &= ~ENA_RXQ_STATE_READY; 288 } 289 290 int 291 ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num) 292 { 293 ena_rxq_t *rxq = (ena_rxq_t *)rh; 294 ena_t *ena = rxq->er_ena; 295 uint32_t intr_ctrl; 296 297 ena_dbg(ena, "ring_rx_start %p: state %x", rxq, rxq->er_state); 298 299 mutex_enter(&rxq->er_lock); 300 if ((rxq->er_state & ENA_RXQ_STATE_SQ_FILLED) == 0) { 301 /* 302 * The ENA controller gets upset and sets the fatal error bit 303 * in its status register if we write a value to an RX SQ's 304 * doorbell that is past its current head. This makes sense as 305 * it would represent there being more descriptors available 306 * than can fit in the ring. For this reason, we make sure that 307 * we only fill the ring once, even if it is started multiple 308 * times. 309 * The `- 1` below is harder to explain. If we completely fill 310 * the SQ ring, then at some time later that seems to be 311 * independent of how many times we've been around the ring, 312 * the ENA controller will set the fatal error bit and stop 313 * responding. Leaving a gap prevents this somehow and it is 314 * what the other open source drivers do. 315 */ 316 ena_refill_rx(rxq, rxq->er_sq_num_descs - 1); 317 rxq->er_state |= ENA_RXQ_STATE_SQ_FILLED; 318 } 319 rxq->er_m_gen_num = gen_num; 320 rxq->er_intr_limit = ena->ena_rxq_intr_limit; 321 mutex_exit(&rxq->er_lock); 322 323 rxq->er_state |= ENA_RXQ_STATE_READY; 324 325 intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr); 326 ENAHW_REG_INTR_UNMASK(intr_ctrl); 327 ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl); 328 rxq->er_state |= ENA_RXQ_STATE_RUNNING; 329 return (0); 330 } 331 332 mblk_t * 333 ena_ring_rx(ena_rxq_t *rxq, int poll_bytes) 334 { 335 ena_t *ena = rxq->er_ena; 336 uint16_t head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1); 337 uint64_t total_bytes = 0; 338 uint64_t num_frames = 0; 339 enahw_rx_cdesc_t *cdesc; 340 boolean_t polling = B_TRUE; 341 mblk_t *head = NULL; 342 mblk_t *tail = NULL; 343 344 ASSERT(MUTEX_HELD(&rxq->er_lock)); 345 ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL); 346 347 if (poll_bytes == ENA_INTERRUPT_MODE) { 348 polling = B_FALSE; 349 } 350 351 cdesc = &rxq->er_cq_descs[head_mod]; 352 VERIFY3P(cdesc, >=, rxq->er_cq_descs); 353 VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); 354 355 while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) { 356 boolean_t first, last; 357 ena_rx_ctrl_block_t *rcb; 358 uint16_t req_id; 359 mblk_t *mp; 360 enahw_io_l3_proto_t l3proto; 361 enahw_io_l4_proto_t l4proto; 362 boolean_t l4csum_checked; 363 uint32_t hflags = 0; 364 365 VERIFY3U(head_mod, <, rxq->er_cq_num_descs); 366 /* 367 * Currently, all incoming frames fit in a single Rx 368 * buffer (erd_length > total frame size). In the 369 * future, if we decide to loan buffers which are 370 * smaller, we will need to modify this code to read 371 * one or more descriptors (based on frame size). 372 * 373 * For this reason we do not expect any frame to span 374 * multiple descriptors. Therefore, we drop any data 375 * not delivered as a single descriptor, i.e., where 376 * 'first' and 'last' are both true. 377 */ 378 first = ENAHW_RX_CDESC_FIRST(cdesc); 379 last = ENAHW_RX_CDESC_LAST(cdesc); 380 381 if (!first || !last) { 382 mutex_enter(&rxq->er_stat_lock); 383 rxq->er_stat.ers_multi_desc.value.ui64++; 384 mutex_exit(&rxq->er_stat_lock); 385 goto next_desc; 386 } 387 388 req_id = cdesc->erc_req_id; 389 VERIFY3U(req_id, <, rxq->er_cq_num_descs); 390 rcb = &rxq->er_rcbs[req_id]; 391 rcb->ercb_offset = cdesc->erc_offset; 392 rcb->ercb_length = cdesc->erc_length; 393 ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total); 394 mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0); 395 396 /* 397 * If we can't allocate an mblk, things are looking 398 * grim. Forget about this frame and move on. 399 */ 400 if (mp == NULL) { 401 mutex_enter(&rxq->er_stat_lock); 402 rxq->er_stat.ers_allocb_fail.value.ui64++; 403 mutex_exit(&rxq->er_stat_lock); 404 goto next_desc; 405 } 406 407 /* 408 * As we pull frames we need to link them together as 409 * one chain to be delivered up to mac. 410 */ 411 if (head == NULL) { 412 head = mp; 413 } else { 414 tail->b_next = mp; 415 } 416 417 tail = mp; 418 419 /* 420 * We need to make sure the bytes are copied to the 421 * correct offset to achieve 4-byte IP header 422 * alignment. 423 * 424 * If we start using desballoc on the buffers, then we 425 * will need to make sure to apply this offset to the 426 * DMA buffers as well. Though it may be the case the 427 * device does this implicitly and that's what 428 * cdesc->erc_offset is for; we don't know because 429 * it's not documented. 430 */ 431 mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT; 432 mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT; 433 bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr, 434 rcb->ercb_length); 435 mp->b_wptr += rcb->ercb_length; 436 total_bytes += rcb->ercb_length; 437 VERIFY3P(mp->b_wptr, >, mp->b_rptr); 438 VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim); 439 440 l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc); 441 l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc); 442 443 /* 444 * When it comes to bad TCP/IP checksums we do not 445 * discard the packet at this level. Instead, we let 446 * it percolate up for further processing and tracking 447 * by the upstream TCP/IP stack. 448 */ 449 if (ena->ena_rx_l3_ipv4_csum && 450 l3proto == ENAHW_IO_L3_PROTO_IPV4) { 451 boolean_t l3_csum_err = 452 ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc); 453 454 if (l3_csum_err) { 455 mutex_enter(&rxq->er_stat_lock); 456 rxq->er_stat.ers_hck_ipv4_err.value.ui64++; 457 mutex_exit(&rxq->er_stat_lock); 458 } else { 459 hflags |= HCK_IPV4_HDRCKSUM_OK; 460 } 461 } 462 463 l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc); 464 465 if (ena->ena_rx_l4_ipv4_csum && l4csum_checked && 466 l4proto == ENAHW_IO_L4_PROTO_TCP) { 467 boolean_t l4_csum_err = 468 ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc); 469 470 if (l4_csum_err) { 471 mutex_enter(&rxq->er_stat_lock); 472 rxq->er_stat.ers_hck_l4_err.value.ui64++; 473 mutex_exit(&rxq->er_stat_lock); 474 } else { 475 hflags |= HCK_FULLCKSUM_OK; 476 } 477 } 478 479 if (hflags != 0) { 480 mac_hcksum_set(mp, 0, 0, 0, 0, hflags); 481 } 482 483 next_desc: 484 /* 485 * Technically, if we arrived here due to a failure, 486 * then we did not read a new frame. However, we count 487 * it all the same anyways in order to count it as 488 * progress to the interrupt work limit. The failure 489 * stats will allow us to differentiate good frames 490 * from bad. 491 */ 492 num_frames++; 493 rxq->er_cq_head_idx++; 494 head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1); 495 496 if (head_mod == 0) { 497 rxq->er_cq_phase ^= 1; 498 } 499 500 if (polling && (total_bytes > poll_bytes)) { 501 break; 502 } else if (!polling && (num_frames >= rxq->er_intr_limit)) { 503 mutex_enter(&rxq->er_stat_lock); 504 rxq->er_stat.ers_intr_limit.value.ui64++; 505 mutex_exit(&rxq->er_stat_lock); 506 break; 507 } 508 509 cdesc = &rxq->er_cq_descs[head_mod]; 510 VERIFY3P(cdesc, >=, rxq->er_cq_descs); 511 VERIFY3P(cdesc, <=, 512 (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); 513 } 514 515 if (num_frames > 0) { 516 mutex_enter(&rxq->er_stat_lock); 517 rxq->er_stat.ers_packets.value.ui64 += num_frames; 518 rxq->er_stat.ers_bytes.value.ui64 += total_bytes; 519 mutex_exit(&rxq->er_stat_lock); 520 521 DTRACE_PROBE4(rx__frames, mblk_t *, head, boolean_t, polling, 522 uint64_t, num_frames, uint64_t, total_bytes); 523 ena_refill_rx(rxq, num_frames); 524 } 525 526 return (head); 527 } 528 529 void 530 ena_rx_intr_work(ena_rxq_t *rxq) 531 { 532 mblk_t *mp; 533 534 mutex_enter(&rxq->er_lock); 535 mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE); 536 mutex_exit(&rxq->er_lock); 537 538 if (mp == NULL) { 539 return; 540 } 541 542 mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num); 543 } 544 545 mblk_t * 546 ena_ring_rx_poll(void *rh, int poll_bytes) 547 { 548 ena_rxq_t *rxq = rh; 549 mblk_t *mp; 550 551 ASSERT3S(poll_bytes, >, 0); 552 553 mutex_enter(&rxq->er_lock); 554 mp = ena_ring_rx(rxq, poll_bytes); 555 mutex_exit(&rxq->er_lock); 556 557 return (mp); 558 } 559