1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2023-2024 Google LLC 5 * 6 * Redistribution and use in source and binary forms, with or without modification, 7 * are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * 3. Neither the name of the copyright holder nor the names of its contributors 17 * may be used to endorse or promote products derived from this software without 18 * specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 24 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 #include "gve.h" 32 #include "gve_adminq.h" 33 #include "gve_dqo.h" 34 35 #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 36 37 static int 38 gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) 39 { 40 struct gve_queue_page_list *qpl = tx->com.qpl; 41 struct gve_tx_fifo *fifo = &tx->fifo; 42 43 fifo->size = qpl->num_pages * PAGE_SIZE; 44 fifo->base = qpl->kva; 45 atomic_store_int(&fifo->available, fifo->size); 46 fifo->head = 0; 47 48 return (0); 49 } 50 51 static void 52 gve_tx_free_ring_gqi(struct gve_priv *priv, int i) 53 { 54 struct gve_tx_ring *tx = &priv->tx[i]; 55 56 if (tx->desc_ring != NULL) { 57 gve_dma_free_coherent(&tx->desc_ring_mem); 58 tx->desc_ring = NULL; 59 } 60 61 if (tx->info != NULL) { 62 free(tx->info, M_GVE); 63 tx->info = NULL; 64 } 65 } 66 67 static void 68 gve_tx_free_ring(struct gve_priv *priv, int i) 69 { 70 struct gve_tx_ring *tx = &priv->tx[i]; 71 struct gve_ring_com *com = &tx->com; 72 73 /* Safe to call even if never alloced */ 74 gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); 75 76 if (mtx_initialized(&tx->ring_mtx)) 77 mtx_destroy(&tx->ring_mtx); 78 79 if (com->q_resources != NULL) { 80 gve_dma_free_coherent(&com->q_resources_mem); 81 com->q_resources = NULL; 82 } 83 84 if (tx->br != NULL) { 85 buf_ring_free(tx->br, M_DEVBUF); 86 tx->br = NULL; 87 } 88 89 if (gve_is_gqi(priv)) 90 gve_tx_free_ring_gqi(priv, i); 91 else 92 gve_tx_free_ring_dqo(priv, i); 93 } 94 95 static int 96 gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) 97 { 98 struct gve_tx_ring *tx = &priv->tx[i]; 99 struct gve_ring_com *com = &tx->com; 100 int err; 101 102 err = gve_dma_alloc_coherent(priv, 103 sizeof(union gve_tx_desc) * priv->tx_desc_cnt, 104 CACHE_LINE_SIZE, &tx->desc_ring_mem); 105 if (err != 0) { 106 device_printf(priv->dev, 107 "Failed to alloc desc ring for tx ring %d", i); 108 goto abort; 109 } 110 tx->desc_ring = tx->desc_ring_mem.cpu_addr; 111 112 com->qpl = &priv->qpls[i]; 113 if (com->qpl == NULL) { 114 device_printf(priv->dev, "No QPL left for tx ring %d\n", i); 115 err = ENOMEM; 116 goto abort; 117 } 118 119 err = gve_tx_fifo_init(priv, tx); 120 if (err != 0) 121 goto abort; 122 123 tx->info = malloc( 124 sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, 125 M_GVE, M_WAITOK | M_ZERO); 126 return (0); 127 128 abort: 129 gve_tx_free_ring_gqi(priv, i); 130 return (err); 131 } 132 133 static int 134 gve_tx_alloc_ring(struct gve_priv *priv, int i) 135 { 136 struct gve_tx_ring *tx = &priv->tx[i]; 137 struct gve_ring_com *com = &tx->com; 138 char mtx_name[16]; 139 int err; 140 141 com->priv = priv; 142 com->id = i; 143 144 if (gve_is_gqi(priv)) 145 err = gve_tx_alloc_ring_gqi(priv, i); 146 else 147 err = gve_tx_alloc_ring_dqo(priv, i); 148 if (err != 0) 149 goto abort; 150 151 sprintf(mtx_name, "gvetx%d", i); 152 mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); 153 154 tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, 155 M_WAITOK, &tx->ring_mtx); 156 157 gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); 158 159 err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), 160 PAGE_SIZE, &com->q_resources_mem); 161 if (err != 0) { 162 device_printf(priv->dev, 163 "Failed to alloc queue resources for tx ring %d", i); 164 goto abort; 165 } 166 com->q_resources = com->q_resources_mem.cpu_addr; 167 168 return (0); 169 170 abort: 171 gve_tx_free_ring(priv, i); 172 return (err); 173 } 174 175 int 176 gve_alloc_tx_rings(struct gve_priv *priv) 177 { 178 int err = 0; 179 int i; 180 181 priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, 182 M_GVE, M_WAITOK | M_ZERO); 183 184 for (i = 0; i < priv->tx_cfg.num_queues; i++) { 185 err = gve_tx_alloc_ring(priv, i); 186 if (err != 0) 187 goto free_rings; 188 189 } 190 191 return (0); 192 193 free_rings: 194 while (i--) 195 gve_tx_free_ring(priv, i); 196 free(priv->tx, M_GVE); 197 return (err); 198 } 199 200 void 201 gve_free_tx_rings(struct gve_priv *priv) 202 { 203 int i; 204 205 for (i = 0; i < priv->tx_cfg.num_queues; i++) 206 gve_tx_free_ring(priv, i); 207 208 free(priv->tx, M_GVE); 209 } 210 211 static void 212 gve_tx_clear_desc_ring(struct gve_tx_ring *tx) 213 { 214 struct gve_ring_com *com = &tx->com; 215 int i; 216 217 for (i = 0; i < com->priv->tx_desc_cnt; i++) { 218 tx->desc_ring[i] = (union gve_tx_desc){}; 219 tx->info[i] = (struct gve_tx_buffer_state){}; 220 } 221 222 bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, 223 BUS_DMASYNC_PREWRITE); 224 } 225 226 static void 227 gve_clear_tx_ring(struct gve_priv *priv, int i) 228 { 229 struct gve_tx_ring *tx = &priv->tx[i]; 230 struct gve_tx_fifo *fifo = &tx->fifo; 231 232 tx->req = 0; 233 tx->done = 0; 234 tx->mask = priv->tx_desc_cnt - 1; 235 236 atomic_store_int(&fifo->available, fifo->size); 237 fifo->head = 0; 238 239 gve_tx_clear_desc_ring(tx); 240 } 241 242 static void 243 gve_start_tx_ring(struct gve_priv *priv, int i, 244 void (cleanup) (void *arg, int pending)) 245 { 246 struct gve_tx_ring *tx = &priv->tx[i]; 247 struct gve_ring_com *com = &tx->com; 248 249 atomic_store_bool(&tx->stopped, false); 250 251 NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx); 252 com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, 253 taskqueue_thread_enqueue, &com->cleanup_tq); 254 taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", 255 device_get_nameunit(priv->dev), i); 256 257 TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); 258 tx->xmit_tq = taskqueue_create_fast("gve tx xmit", 259 M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); 260 taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", 261 device_get_nameunit(priv->dev), i); 262 } 263 264 int 265 gve_create_tx_rings(struct gve_priv *priv) 266 { 267 struct gve_ring_com *com; 268 struct gve_tx_ring *tx; 269 int err; 270 int i; 271 272 if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) 273 return (0); 274 275 for (i = 0; i < priv->tx_cfg.num_queues; i++) { 276 if (gve_is_gqi(priv)) 277 gve_clear_tx_ring(priv, i); 278 else 279 gve_clear_tx_ring_dqo(priv, i); 280 } 281 282 err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); 283 if (err != 0) 284 return (err); 285 286 bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, 287 BUS_DMASYNC_POSTREAD); 288 289 for (i = 0; i < priv->tx_cfg.num_queues; i++) { 290 tx = &priv->tx[i]; 291 com = &tx->com; 292 293 com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); 294 295 bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, 296 BUS_DMASYNC_POSTREAD); 297 com->db_offset = 4 * be32toh(com->q_resources->db_index); 298 com->counter_idx = be32toh(com->q_resources->counter_index); 299 300 if (gve_is_gqi(priv)) 301 gve_start_tx_ring(priv, i, gve_tx_cleanup_tq); 302 else 303 gve_start_tx_ring(priv, i, gve_tx_cleanup_tq_dqo); 304 } 305 306 gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); 307 return (0); 308 } 309 310 static void 311 gve_stop_tx_ring(struct gve_priv *priv, int i) 312 { 313 struct gve_tx_ring *tx = &priv->tx[i]; 314 struct gve_ring_com *com = &tx->com; 315 316 if (com->cleanup_tq != NULL) { 317 taskqueue_quiesce(com->cleanup_tq); 318 taskqueue_free(com->cleanup_tq); 319 com->cleanup_tq = NULL; 320 } 321 322 if (tx->xmit_tq != NULL) { 323 taskqueue_quiesce(tx->xmit_tq); 324 taskqueue_free(tx->xmit_tq); 325 tx->xmit_tq = NULL; 326 } 327 } 328 329 int 330 gve_destroy_tx_rings(struct gve_priv *priv) 331 { 332 int err; 333 int i; 334 335 for (i = 0; i < priv->tx_cfg.num_queues; i++) 336 gve_stop_tx_ring(priv, i); 337 338 if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { 339 err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); 340 if (err != 0) 341 return (err); 342 gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); 343 } 344 345 return (0); 346 } 347 348 int 349 gve_tx_intr(void *arg) 350 { 351 struct gve_tx_ring *tx = arg; 352 struct gve_priv *priv = tx->com.priv; 353 struct gve_ring_com *com = &tx->com; 354 355 if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) 356 return (FILTER_STRAY); 357 358 gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); 359 taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); 360 return (FILTER_HANDLED); 361 } 362 363 static uint32_t 364 gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) 365 { 366 bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, 367 BUS_DMASYNC_POSTREAD); 368 uint32_t counter = priv->counters[tx->com.counter_idx]; 369 return (be32toh(counter)); 370 } 371 372 static void 373 gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) 374 { 375 atomic_add_int(&fifo->available, bytes); 376 } 377 378 void 379 gve_tx_cleanup_tq(void *arg, int pending) 380 { 381 struct gve_tx_ring *tx = arg; 382 struct gve_priv *priv = tx->com.priv; 383 uint32_t nic_done = gve_tx_load_event_counter(priv, tx); 384 uint32_t todo = nic_done - tx->done; 385 size_t space_freed = 0; 386 int i, j; 387 388 if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) 389 return; 390 391 for (j = 0; j < todo; j++) { 392 uint32_t idx = tx->done & tx->mask; 393 struct gve_tx_buffer_state *info = &tx->info[idx]; 394 struct mbuf *mbuf = info->mbuf; 395 396 tx->done++; 397 if (mbuf == NULL) 398 continue; 399 400 info->mbuf = NULL; 401 counter_enter(); 402 counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); 403 counter_u64_add_protected(tx->stats.tpackets, 1); 404 counter_exit(); 405 m_freem(mbuf); 406 407 for (i = 0; i < GVE_TX_MAX_DESCS; i++) { 408 space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; 409 info->iov[i].iov_len = 0; 410 info->iov[i].iov_padding = 0; 411 } 412 } 413 414 gve_tx_free_fifo(&tx->fifo, space_freed); 415 416 gve_db_bar_write_4(priv, tx->com.irq_db_offset, 417 GVE_IRQ_ACK | GVE_IRQ_EVENT); 418 419 /* 420 * Completions born before this barrier MAY NOT cause the NIC to send an 421 * interrupt but they will still be handled by the enqueue below. 422 * Completions born after the barrier WILL trigger an interrupt. 423 */ 424 mb(); 425 426 nic_done = gve_tx_load_event_counter(priv, tx); 427 todo = nic_done - tx->done; 428 if (todo != 0) { 429 gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); 430 taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); 431 } 432 433 if (atomic_load_bool(&tx->stopped) && space_freed) { 434 atomic_store_bool(&tx->stopped, false); 435 taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); 436 } 437 } 438 439 static void 440 gve_dma_sync_for_device(struct gve_queue_page_list *qpl, 441 uint64_t iov_offset, uint64_t iov_len) 442 { 443 uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; 444 uint64_t first_page = iov_offset / PAGE_SIZE; 445 struct gve_dma_handle *dma; 446 uint64_t page; 447 448 for (page = first_page; page <= last_page; page++) { 449 dma = &(qpl->dmas[page]); 450 bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); 451 } 452 } 453 454 static void 455 gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) 456 { 457 mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; 458 mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; 459 mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); 460 mtd_desc->reserved0 = 0; 461 mtd_desc->reserved1 = 0; 462 } 463 464 static void 465 gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, 466 uint16_t l4_hdr_offset, uint32_t desc_cnt, 467 uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, 468 int csum_offset, uint16_t pkt_len) 469 { 470 if (is_tso) { 471 pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; 472 pkt_desc->l4_csum_offset = csum_offset >> 1; 473 pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; 474 } else if (has_csum_flag) { 475 pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; 476 pkt_desc->l4_csum_offset = csum_offset >> 1; 477 pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; 478 } else { 479 pkt_desc->type_flags = GVE_TXD_STD; 480 pkt_desc->l4_csum_offset = 0; 481 pkt_desc->l4_hdr_offset = 0; 482 } 483 pkt_desc->desc_cnt = desc_cnt; 484 pkt_desc->len = htobe16(pkt_len); 485 pkt_desc->seg_len = htobe16(first_seg_len); 486 pkt_desc->seg_addr = htobe64(addr); 487 } 488 489 static void 490 gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, 491 bool is_tso, uint16_t len, uint64_t addr, 492 bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) 493 { 494 seg_desc->type_flags = GVE_TXD_SEG; 495 if (is_tso) { 496 if (is_ipv6) 497 seg_desc->type_flags |= GVE_TXSF_IPV6; 498 seg_desc->l3_offset = l3_off >> 1; 499 seg_desc->mss = htobe16(tso_mss); 500 } 501 seg_desc->seg_len = htobe16(len); 502 seg_desc->seg_addr = htobe64(addr); 503 } 504 505 static inline uint32_t 506 gve_tx_avail(struct gve_tx_ring *tx) 507 { 508 return (tx->mask + 1 - (tx->req - tx->done)); 509 } 510 511 static bool 512 gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) 513 { 514 return (atomic_load_int(&fifo->available) >= bytes); 515 } 516 517 static inline bool 518 gve_can_tx(struct gve_tx_ring *tx, int bytes_required) 519 { 520 return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && 521 gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); 522 } 523 524 static int 525 gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) 526 { 527 return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; 528 } 529 530 static inline int 531 gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, 532 uint16_t pkt_len) 533 { 534 int pad_bytes, align_hdr_pad; 535 int bytes; 536 537 pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); 538 /* We need to take into account the header alignment padding. */ 539 align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; 540 bytes = align_hdr_pad + pad_bytes + pkt_len; 541 542 return (bytes); 543 } 544 545 static int 546 gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, 547 struct gve_tx_iovec iov[2]) 548 { 549 size_t overflow, padding; 550 uint32_t aligned_head; 551 int nfrags = 0; 552 553 if (bytes == 0) 554 return (0); 555 556 /* 557 * This check happens before we know how much padding is needed to 558 * align to a cacheline boundary for the payload, but that is fine, 559 * because the FIFO head always start aligned, and the FIFO's boundaries 560 * are aligned, so if there is space for the data, there is space for 561 * the padding to the next alignment. 562 */ 563 KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), 564 ("Allocating gve tx fifo when there is no room")); 565 566 nfrags++; 567 568 iov[0].iov_offset = fifo->head; 569 iov[0].iov_len = bytes; 570 fifo->head += bytes; 571 572 if (fifo->head > fifo->size) { 573 /* 574 * If the allocation did not fit in the tail fragment of the 575 * FIFO, also use the head fragment. 576 */ 577 nfrags++; 578 overflow = fifo->head - fifo->size; 579 iov[0].iov_len -= overflow; 580 iov[1].iov_offset = 0; /* Start of fifo*/ 581 iov[1].iov_len = overflow; 582 583 fifo->head = overflow; 584 } 585 586 /* Re-align to a cacheline boundary */ 587 aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); 588 padding = aligned_head - fifo->head; 589 iov[nfrags - 1].iov_padding = padding; 590 atomic_add_int(&fifo->available, -(bytes + padding)); 591 fifo->head = aligned_head; 592 593 if (fifo->head == fifo->size) 594 fifo->head = 0; 595 596 return (nfrags); 597 } 598 599 /* Only error this returns is ENOBUFS when the tx fifo is short of space */ 600 static int 601 gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) 602 { 603 bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; 604 int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; 605 uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; 606 int pad_bytes, hdr_nfrags, payload_nfrags; 607 struct gve_tx_pkt_desc *pkt_desc; 608 struct gve_tx_seg_desc *seg_desc; 609 struct gve_tx_mtd_desc *mtd_desc; 610 struct gve_tx_buffer_state *info; 611 uint32_t idx = tx->req & tx->mask; 612 struct ether_header *eh; 613 struct mbuf *mbuf_next; 614 int payload_iov = 2; 615 int bytes_required; 616 struct ip6_hdr *ip6; 617 struct tcphdr *th; 618 uint32_t next_idx; 619 uint8_t l3_off; 620 struct ip *ip; 621 int i; 622 623 info = &tx->info[idx]; 624 csum_flags = mbuf->m_pkthdr.csum_flags; 625 pkt_len = mbuf->m_pkthdr.len; 626 is_tso = csum_flags & CSUM_TSO; 627 has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | 628 CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); 629 mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; 630 tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0; 631 632 eh = mtod(mbuf, struct ether_header *); 633 KASSERT(eh->ether_type != ETHERTYPE_VLAN, 634 ("VLAN-tagged packets not supported")); 635 636 is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; 637 l3_off = ETHER_HDR_LEN; 638 mbuf_next = m_getptr(mbuf, l3_off, &offset); 639 640 if (is_ipv6) { 641 ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); 642 l4_off = l3_off + sizeof(struct ip6_hdr); 643 is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); 644 is_udp = (ip6->ip6_nxt == IPPROTO_UDP); 645 mbuf_next = m_getptr(mbuf, l4_off, &offset); 646 } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { 647 ip = (struct ip *)(mtodo(mbuf_next, offset)); 648 l4_off = l3_off + (ip->ip_hl << 2); 649 is_tcp = (ip->ip_p == IPPROTO_TCP); 650 is_udp = (ip->ip_p == IPPROTO_UDP); 651 mbuf_next = m_getptr(mbuf, l4_off, &offset); 652 } 653 654 l4_data_off = 0; 655 if (is_tcp) { 656 th = (struct tcphdr *)(mtodo(mbuf_next, offset)); 657 l4_data_off = l4_off + (th->th_off << 2); 658 } else if (is_udp) 659 l4_data_off = l4_off + sizeof(struct udphdr); 660 661 if (has_csum_flag) { 662 if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) 663 csum_offset = offsetof(struct tcphdr, th_sum); 664 else 665 csum_offset = offsetof(struct udphdr, uh_sum); 666 } 667 668 /* 669 * If this packet is neither a TCP nor a UDP packet, the first segment, 670 * the one represented by the packet descriptor, will carry the 671 * spec-stipulated minimum of 182B. 672 */ 673 if (l4_data_off != 0) 674 first_seg_len = l4_data_off; 675 else 676 first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); 677 678 bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); 679 if (__predict_false(!gve_can_tx(tx, bytes_required))) { 680 counter_enter(); 681 counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); 682 counter_exit(); 683 return (ENOBUFS); 684 } 685 686 /* So that the cleanup taskqueue can free the mbuf eventually. */ 687 info->mbuf = mbuf; 688 689 /* 690 * We don't want to split the header, so if necessary, pad to the end 691 * of the fifo and then put the header at the beginning of the fifo. 692 */ 693 pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); 694 hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, 695 &info->iov[0]); 696 KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); 697 payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, 698 &info->iov[payload_iov]); 699 700 pkt_desc = &tx->desc_ring[idx].pkt; 701 gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, 702 1 + mtd_desc_nr + payload_nfrags, first_seg_len, 703 info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, 704 pkt_len); 705 706 m_copydata(mbuf, 0, first_seg_len, 707 (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); 708 gve_dma_sync_for_device(tx->com.qpl, 709 info->iov[hdr_nfrags - 1].iov_offset, 710 info->iov[hdr_nfrags - 1].iov_len); 711 copy_offset = first_seg_len; 712 713 if (mtd_desc_nr == 1) { 714 next_idx = (tx->req + 1) & tx->mask; 715 mtd_desc = &tx->desc_ring[next_idx].mtd; 716 gve_tx_fill_mtd_desc(mtd_desc, mbuf); 717 } 718 719 for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { 720 next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; 721 seg_desc = &tx->desc_ring[next_idx].seg; 722 723 gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, 724 info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); 725 726 m_copydata(mbuf, copy_offset, info->iov[i].iov_len, 727 (char *)tx->fifo.base + info->iov[i].iov_offset); 728 gve_dma_sync_for_device(tx->com.qpl, 729 info->iov[i].iov_offset, info->iov[i].iov_len); 730 copy_offset += info->iov[i].iov_len; 731 } 732 733 tx->req += (1 + mtd_desc_nr + payload_nfrags); 734 if (is_tso) { 735 counter_enter(); 736 counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); 737 counter_exit(); 738 } 739 return (0); 740 } 741 742 static int 743 gve_xmit_mbuf(struct gve_tx_ring *tx, 744 struct mbuf **mbuf) 745 { 746 if (gve_is_gqi(tx->com.priv)) 747 return (gve_xmit(tx, *mbuf)); 748 749 if (gve_is_qpl(tx->com.priv)) 750 return (gve_xmit_dqo_qpl(tx, *mbuf)); 751 752 /* 753 * gve_xmit_dqo might attempt to defrag the mbuf chain. 754 * The reference is passed in so that in the case of 755 * errors, the new mbuf chain is what's put back on the br. 756 */ 757 return (gve_xmit_dqo(tx, mbuf)); 758 } 759 760 /* 761 * Has the side-effect of stopping the xmit queue by setting tx->stopped 762 */ 763 static int 764 gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, 765 struct mbuf **mbuf) 766 { 767 int err; 768 769 atomic_store_bool(&tx->stopped, true); 770 771 /* 772 * Room made in the queue BEFORE the barrier will be seen by the 773 * gve_xmit_mbuf retry below. 774 * 775 * If room is made in the queue AFTER the barrier, the cleanup tq 776 * iteration creating the room will either see a tx->stopped value 777 * of 0 or the 1 we just wrote: 778 * 779 * If it sees a 1, then it would enqueue the xmit tq. Enqueue 780 * implies a retry on the waiting pkt. 781 * 782 * If it sees a 0, then that implies a previous iteration overwrote 783 * our 1, and that iteration would enqueue the xmit tq. Enqueue 784 * implies a retry on the waiting pkt. 785 */ 786 atomic_thread_fence_seq_cst(); 787 788 err = gve_xmit_mbuf(tx, mbuf); 789 if (err == 0) 790 atomic_store_bool(&tx->stopped, false); 791 792 return (err); 793 } 794 795 static void 796 gve_xmit_br(struct gve_tx_ring *tx) 797 { 798 struct gve_priv *priv = tx->com.priv; 799 struct ifnet *ifp = priv->ifp; 800 struct mbuf *mbuf; 801 int err; 802 803 while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && 804 (mbuf = drbr_peek(ifp, tx->br)) != NULL) { 805 err = gve_xmit_mbuf(tx, &mbuf); 806 807 /* 808 * We need to stop this taskqueue when we can't xmit the pkt due 809 * to lack of space in the NIC ring (ENOBUFS). The retry exists 810 * to guard against a TOCTTOU bug that could end up freezing the 811 * queue forever. 812 */ 813 if (__predict_false(mbuf != NULL && err == ENOBUFS)) 814 err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); 815 816 if (__predict_false(err != 0 && mbuf != NULL)) { 817 if (err == EINVAL) { 818 drbr_advance(ifp, tx->br); 819 m_freem(mbuf); 820 } else 821 drbr_putback(ifp, tx->br, mbuf); 822 break; 823 } 824 825 drbr_advance(ifp, tx->br); 826 BPF_MTAP(ifp, mbuf); 827 828 bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, 829 BUS_DMASYNC_PREWRITE); 830 831 if (gve_is_gqi(priv)) 832 gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); 833 else 834 gve_db_bar_dqo_write_4(priv, tx->com.db_offset, 835 tx->dqo.desc_tail); 836 } 837 } 838 839 void 840 gve_xmit_tq(void *arg, int pending) 841 { 842 struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; 843 844 GVE_RING_LOCK(tx); 845 gve_xmit_br(tx); 846 GVE_RING_UNLOCK(tx); 847 } 848 849 static bool 850 is_vlan_tagged_pkt(struct mbuf *mbuf) 851 { 852 struct ether_header *eh; 853 854 eh = mtod(mbuf, struct ether_header *); 855 return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); 856 } 857 858 int 859 gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) 860 { 861 struct gve_priv *priv = if_getsoftc(ifp); 862 struct gve_tx_ring *tx; 863 bool is_br_empty; 864 int err; 865 uint32_t i; 866 867 if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) 868 return (ENODEV); 869 870 if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) 871 i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; 872 else 873 i = curcpu % priv->tx_cfg.num_queues; 874 tx = &priv->tx[i]; 875 876 if (__predict_false(is_vlan_tagged_pkt(mbuf))) { 877 counter_enter(); 878 counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); 879 counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); 880 counter_exit(); 881 m_freem(mbuf); 882 return (ENODEV); 883 } 884 885 is_br_empty = drbr_empty(ifp, tx->br); 886 err = drbr_enqueue(ifp, tx->br, mbuf); 887 if (__predict_false(err != 0)) { 888 if (!atomic_load_bool(&tx->stopped)) 889 taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); 890 counter_enter(); 891 counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); 892 counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); 893 counter_exit(); 894 return (err); 895 } 896 897 /* 898 * If the mbuf we just enqueued is the only one on the ring, then 899 * transmit it right away in the interests of low latency. 900 */ 901 if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { 902 gve_xmit_br(tx); 903 GVE_RING_UNLOCK(tx); 904 } else if (!atomic_load_bool(&tx->stopped)) 905 taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); 906 907 return (0); 908 } 909 910 void 911 gve_qflush(if_t ifp) 912 { 913 struct gve_priv *priv = if_getsoftc(ifp); 914 struct gve_tx_ring *tx; 915 int i; 916 917 for (i = 0; i < priv->tx_cfg.num_queues; ++i) { 918 tx = &priv->tx[i]; 919 if (drbr_empty(ifp, tx->br) == 0) { 920 GVE_RING_LOCK(tx); 921 drbr_flush(ifp, tx->br); 922 GVE_RING_UNLOCK(tx); 923 } 924 } 925 926 if_qflush(ifp); 927 } 928