/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}

	if (com->qpl != NULL) {
		gve_free_qpl(priv, com->qpl);
		com->qpl = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

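/*
 * Allocates the GQI resources for tx ring i: the DMA-coherent descriptor
 * ring, the queue page list (QPL) that backs the copy FIFO, and the
 * per-descriptor buffer-state array. On any failure, whatever was allocated
 * so far is released via gve_tx_free_ring_gqi().
 */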
static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d\n", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
	    /*single_kva=*/true);
	if (com->qpl == NULL) {
		device_printf(priv->dev,
		    "Failed to alloc QPL for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d\n", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	tx->last_kicked = 0;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;
	int err;

	KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));

	for (i = start_idx; i < stop_idx; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);
free_rings:
	gve_free_tx_rings(priv, start_idx, i);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;

	for (i = start_idx; i < stop_idx; i++)
		gve_tx_free_ring(priv, i);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
		gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec);
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

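/*
 * Marks the ring as not stopped and creates its two taskqueues: the cleanup
 * taskqueue that services completion interrupts, and the xmit taskqueue that
 * drains the ring's buf_ring whenever transmission has to be deferred.
 */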
static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0,
		    gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

int
gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_tx_buffer_state *info;
	uint32_t pkt_idx;
	int num_timeouts;

	num_timeouts = 0;

	for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) {
		info = &tx->info[pkt_idx];

		if (!gve_timestamp_valid(&info->enqueue_time_sec))
			continue;

		if (__predict_false(
		    gve_seconds_since(&info->enqueue_time_sec) >
		    GVE_TX_TIMEOUT_PKT_SEC))
			num_timeouts += 1;
	}

	return (num_timeouts);
}

int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

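/*
 * Returns the NIC-written completion counter for this ring from the shared
 * counter array. The difference between this value and tx->done is the
 * number of descriptors the device has finished with since the last cleanup.
 */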
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

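/*
 * Completion handler, run from the cleanup taskqueue: for every descriptor
 * the NIC reports done, frees the associated mbuf, invalidates its timeout
 * timestamp, updates the byte/packet counters, and returns the FIFO space it
 * occupied. It then unmasks the interrupt, re-checks the event counter to
 * close the race with completions that arrived meanwhile, and restarts the
 * xmit taskqueue if the ring had been stopped for lack of space.
 */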
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		gve_invalidate_timestamp(&info->enqueue_time_sec);

		info->mbuf = NULL;

		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

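/*
 * Returns 0 if 'bytes' fits strictly before the end of the FIFO starting at
 * the current head; otherwise returns the number of bytes from the head to
 * the end of the FIFO, i.e. the padding needed to start the fragment at
 * offset 0 instead of splitting it across the wraparound.
 */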
static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

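/*
 * Carves 'bytes' out of the copy FIFO and records the resulting fragments in
 * iov[]: one fragment if the allocation fits before the end of the FIFO, two
 * if it wraps around to offset 0. The head is then padded up to the next
 * cacheline boundary and the consumed bytes (including padding) are taken
 * out of fifo->available.
 *
 * Illustrative example (assumed values only): with a 4096-byte FIFO, a
 * 64-byte CACHE_LINE_SIZE and the head at 3968, allocating 200 bytes yields
 * iov[0] = {offset 3968, len 128} and iov[1] = {offset 0, len 72}; the head
 * is then re-aligned to 128, iov[1].iov_padding becomes 56, and 256 bytes
 * (200 + 56) are subtracted from fifo->available.
 */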
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

/* The only error this returns is ENOBUFS, when the tx fifo is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(ntohs(eh->ether_type) != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	gve_set_timestamp(&info->enqueue_time_sec);

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

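/*
 * Hands the mbuf to the transmit routine matching the queue format:
 * gve_xmit() for GQI, gve_xmit_dqo_qpl() when a QPL is in use, and
 * gve_xmit_dqo() otherwise. The mbuf is passed by reference because
 * gve_xmit_dqo() may replace the chain while defragmenting it.
 */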
static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

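/*
 * Drains the ring's buf_ring while the interface is running: each mbuf is
 * peeked, handed to gve_xmit_mbuf(), and on success advanced past, tapped to
 * BPF, and announced to the NIC via the doorbell. An ENOBUFS result is
 * retried once through gve_xmit_retry_enobuf_mbuf(); on failure, EINVAL drops
 * the packet while other errors put it back on the ring, and the drain stops.
 * Called with the ring mutex held.
 */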
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

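/*
 * if_transmit handler. Picks a tx ring from the mbuf's flowid (or from curcpu
 * when no hash is present), drops VLAN-tagged packets, and enqueues the mbuf
 * on that ring's buf_ring. If the buf_ring was empty and the ring lock is
 * uncontended, the packet is transmitted inline; otherwise the ring's xmit
 * taskqueue is kicked.
 */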
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}