/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d\n", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

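	/*
	 * GQI rings stage packet bytes in a bounce FIFO carved out of the
	 * ring's pre-registered queue page list (QPL): transmission is
	 * copy-based, so no per-mbuf DMA mapping is needed (see gve_xmit()).
	 */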
	com->qpl = &priv->qpls[i];
	if (com->qpl == NULL) {
		device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d\n", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
	    M_GVE, M_WAITOK | M_ZERO);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);

free_rings:
	while (i--)
		gve_tx_free_ring(priv, i);
	free(priv->tx, M_GVE);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_tx_free_ring(priv, i);

	free(priv->tx, M_GVE);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

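	/*
	 * The xmit taskqueue drains the buf_ring whenever the if_transmit
	 * path cannot do so inline, and is kicked again by the cleanup
	 * taskqueue once a stopped ring has had space freed.
	 */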
	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

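	/*
	 * Reclaim every descriptor the NIC has reported complete: free the
	 * mbuf, account bytes/packets, and tally how much FIFO space the
	 * packet consumed so it can be returned in one shot below.
	 */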
	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

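/*
 * A GQI packet occupies one packet descriptor, an optional metadata (MTD)
 * descriptor carrying the flow hash, and one segment descriptor per extra
 * FIFO fragment; gve_xmit() below fills them in that order.
 */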
static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

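/*
 * Carves "bytes" out of the copy FIFO, wrapping to the start of the FIFO
 * when the request does not fit contiguously. Returns the number of iovecs
 * used (0 for an empty request, 1, or 2 on wraparound) and pads the last
 * iovec so that the next allocation starts cacheline-aligned.
 */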
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

/* The only error this returns is ENOBUFS when the tx FIFO is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(ntohs(eh->ether_type) != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

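	/*
	 * Check both descriptor-ring slots and FIFO bytes before committing
	 * anything; on ENOBUFS the caller parks the packet until the cleanup
	 * taskqueue frees space and restarts the xmit taskqueue.
	 */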
	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

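/*
 * if_transmit handler: pick a tx queue from the mbuf's flowid (or the
 * current CPU when the mbuf carries no hash), enqueue on that queue's
 * buf_ring, and drain the ring inline when it is otherwise idle.
 */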
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}