/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}

	if (com->qpl != NULL) {
		gve_free_qpl(priv, com->qpl);
		com->qpl = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}
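
/*
 * GQI ring allocation overview (descriptive summary of the code below): each
 * TX queue owns a DMA-coherent descriptor ring shared with the NIC, a queue
 * page list (QPL) whose single KVA mapping backs the copy FIFO, and a
 * host-only info[] array recording, per descriptor, the mbuf and the FIFO
 * iovecs it occupies. As an illustration, assuming 4 KiB pages, a QPL of
 * tx_desc_cnt / GVE_QPL_DIVISOR pages gives a FIFO of num_pages * 4 KiB
 * bytes, which is exactly the fifo->size computed in gve_tx_fifo_init()
 * above.
 */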

static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
	    /*single_kva=*/true);
	if (com->qpl == NULL) {
		device_printf(priv->dev,
		    "Failed to alloc QPL for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;
	int err;

	KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));

	for (i = start_idx; i < stop_idx; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);
free_rings:
	gve_free_tx_rings(priv, start_idx, i);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;

	for (i = start_idx; i < stop_idx; i++)
		gve_tx_free_ring(priv, i);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}
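
/*
 * Each ring is serviced by two taskqueues created below (descriptive
 * summary): the cleanup taskqueue, enqueued from the gve_tx_intr() filter,
 * processes completions and reclaims FIFO space, while the xmit taskqueue
 * drains the per-ring buf_ring whenever a packet could not be transmitted
 * inline from gve_xmit_ifp(). The DQO variants of the cleanup handler are
 * provided elsewhere in the driver.
 */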

static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}
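
/*
 * Completion model for GQI (illustrative): the NIC reports TX progress by
 * advancing a per-queue 32-bit event counter in the host counter array,
 * which gve_tx_load_event_counter() above reads. The cleanup handler below
 * reclaims (nic_done - tx->done) descriptors. Since both values are
 * uint32_t, the subtraction remains correct across counter wrap; for
 * example, nic_done == 5 with tx->done == 0xfffffffb yields todo == 10.
 */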

void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}
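
/*
 * On-ring layout of one GQI packet (illustrative): a packet descriptor
 * filled by gve_tx_fill_pkt_desc() above, optionally followed by one
 * metadata (MTD) descriptor carrying the flow hash, followed by one segment
 * descriptor per additional FIFO fragment, e.g.:
 *
 *	[pkt: first_seg_len][mtd][seg: payload frag 0][seg: payload frag 1]
 *
 * desc_cnt in the packet descriptor counts all of them, which is why
 * gve_xmit() below passes 1 + mtd_desc_nr + payload_nfrags.
 */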

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}
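
/*
 * FIFO allocation, worked example (illustrative numbers, assuming a 64-byte
 * CACHE_LINE_SIZE): with fifo->size == 8192, fifo->head == 8000 and a
 * request for 400 bytes, the allocation wraps past the end of the FIFO, so
 * gve_tx_alloc_fifo() below returns two fragments: iov[0] = {offset 8000,
 * len 192} and iov[1] = {offset 0, len 208}. The new head (208) is then
 * rounded up to 256, the 48 bytes of padding are recorded in the last
 * fragment, and 400 + 48 bytes are charged against fifo->available.
 */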

static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's
	 * boundaries are aligned, so if there is space for the data, there is
	 * space for the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of FIFO */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}
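
/*
 * Transmit path sketch (descriptive, in the order gve_xmit() below performs
 * it): parse the Ethernet, IP and L4 headers to find where the L4 payload
 * starts, pick first_seg_len (the bytes carried by the packet descriptor),
 * reserve FIFO space for header and payload, fill the packet, metadata and
 * segment descriptors, m_copydata() the mbuf into the FIFO, and advance
 * tx->req. The doorbell is not written here; gve_xmit_br() rings it after
 * the packet has been handed off and tapped by BPF.
 */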

/* The only error this returns is ENOBUFS when the tx fifo is short of space */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(eh->ether_type != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}
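
/*
 * Why the retry below exists, illustrated with a hypothetical interleaving:
 * gve_xmit_mbuf() fails with ENOBUFS, and before tx->stopped is set the
 * cleanup taskqueue frees space, observes stopped == false and therefore
 * does not wake the xmit taskqueue. The packet could then sit on the
 * buf_ring indefinitely. Setting stopped first, issuing a full fence and
 * retrying the transmit closes that window; the comment inside the function
 * walks through the remaining cases.
 */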

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}
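
/*
 * gve_xmit_ifp() below is the driver's transmit entry point (descriptive
 * summary): the target ring is chosen from the mbuf flowid when an RSS hash
 * is present, otherwise from curcpu, reduced modulo the number of TX queues.
 * VLAN-tagged frames are counted and dropped (see is_vlan_tagged_pkt()
 * above). The mbuf is then enqueued on the ring's buf_ring; if the ring was
 * empty and the ring lock is uncontended, it is transmitted inline,
 * otherwise the xmit taskqueue picks it up.
 */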

int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}