// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <net/ip.h>
#include <linux/bpf.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <net/xdp_sock_drv.h>

/* Returns true if at least `count` TX QPL buffers are available.
 *
 * Rings without a QPL (tx->dqo.qpl == NULL) always report true. The number
 * of buffers in use is alloc_tx_qpl_buf_cnt - free_tx_qpl_buf_cnt; the TX
 * path's cached free count is tried first and is only refreshed from the
 * completion path's atomic counter when the cached value is not sufficient.
 */
static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count)
{
	int num_avail;

	if (!tx->dqo.qpl)
		return true;

	num_avail = tx->dqo.num_tx_qpl_bufs -
		(tx->dqo_tx.alloc_tx_qpl_buf_cnt -
		 tx->dqo_tx.free_tx_qpl_buf_cnt);

	if (count <= num_avail)
		return true;

	/* Update cached value from dqo_compl. */
	tx->dqo_tx.free_tx_qpl_buf_cnt =
		atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_cnt);

	num_avail = tx->dqo.num_tx_qpl_bufs -
		(tx->dqo_tx.alloc_tx_qpl_buf_cnt -
		 tx->dqo_tx.free_tx_qpl_buf_cnt);

	return count <= num_avail;
}

/* Pops one TX QPL buffer index from the TX path's free list.
 *
 * When the TX path's list is empty (head == -1), the whole list owned by
 * the completion handler is taken over with a single atomic_xchg().
 *
 * Returns the buffer index, or -1 if both lists are empty.
 */
static s16
gve_alloc_tx_qpl_buf(struct gve_tx_ring *tx)
{
	s16 index;

	index = tx->dqo_tx.free_tx_qpl_buf_head;

	/* No TX buffers available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_tx_qpl_buf_head =
			atomic_xchg(&tx->dqo_compl.free_tx_qpl_buf_head, -1);
		index = tx->dqo_tx.free_tx_qpl_buf_head;

		if (unlikely(index == -1))
			return index;
	}

	/* Remove TX buf from free list */
	tx->dqo_tx.free_tx_qpl_buf_head = tx->dqo.tx_qpl_buf_next[index];

	return index;
}

/* Returns all QPL buffers held by `pkt` to the completion handler's free list.
 *
 * The packet's buffer ids are first chained together through
 * tx->dqo.tx_qpl_buf_next, then the chain is pushed in front of
 * dqo_compl.free_tx_qpl_buf_head with a cmpxchg retry loop. Afterwards the
 * completion-side free counter is increased by pkt->num_bufs and
 * pkt->num_bufs is reset to 0. Does nothing if the packet holds no buffers.
 */
static void
gve_free_tx_qpl_bufs(struct gve_tx_ring *tx,
		     struct gve_tx_pending_packet_dqo *pkt)
{
	s16 index;
	int i;

	if (!pkt->num_bufs)
		return;

	index = pkt->tx_qpl_buf_ids[0];
	/* Create a linked list of buffers to be added to the free list */
	for (i = 1; i < pkt->num_bufs; i++) {
		tx->dqo.tx_qpl_buf_next[index] = pkt->tx_qpl_buf_ids[i];
		index = pkt->tx_qpl_buf_ids[i];
	}

	/* `index` is now the last buffer of the chain; link it to the current
	 * head and retry until the head is swapped without interference.
	 */
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_head);

		tx->dqo.tx_qpl_buf_next[index] = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_tx_qpl_buf_head,
				   old_head,
				   pkt->tx_qpl_buf_ids[0]) == old_head) {
			break;
		}
	}

	atomic_add(pkt->num_bufs, &tx->dqo_compl.free_tx_qpl_buf_cnt);
	pkt->num_bufs = 0;
}

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
	/* Check TX path's list. */
	if (tx->dqo_tx.free_pending_packets != -1)
		return true;

	/* Check completion handler's list. */
	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
		return true;

	return false;
}

/* Rings the TX doorbell of the ring backing XDP queue `xdp_qid`, publishing
 * that ring's current dqo_tx.tail.
 */
void gve_xdp_tx_flush_dqo(struct gve_priv *priv, u32 xdp_qid)
{
	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
	struct gve_tx_ring *tx = &priv->tx[tx_qid];

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
}

/* Takes one pending packet object off the TX path's free list.
 *
 * When the TX path's list is empty, the completion handler's whole list is
 * taken over with atomic_xchg(). The returned object is put into
 * GVE_PACKET_STATE_PENDING_DATA_COMPL.
 *
 * Returns NULL if no pending packet object is free.
 */
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 index;

	index = tx->dqo_tx.free_pending_packets;

	/* No pending_packets available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_pending_packets =
			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
		index = tx->dqo_tx.free_pending_packets;

		if (unlikely(index == -1))
			return NULL;
	}

	pending_packet = &tx->dqo.pending_packets[index];

	/* Remove pending_packet from free list */
	tx->dqo_tx.free_pending_packets = pending_packet->next;
	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

	return pending_packet;
}

/* Marks `pending_packet` as GVE_PACKET_STATE_UNALLOCATED and pushes it onto
 * the completion handler's free list using a cmpxchg retry loop.
 */
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	/* The list stores array indices, not pointers. */
	s16 index = pending_packet - tx->dqo.pending_packets;

	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

		pending_packet->next = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
				   old_head, index) == old_head) {
			break;
		}
	}
}

/* gve_tx_clean_pending_packets - Cleans up all pending tx requests and buffers.
*/
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
	int i;

	/* Walk every slot of the pending packet array, whatever its state. */
	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
		struct gve_tx_pending_packet_dqo *cur_state =
			&tx->dqo.pending_packets[i];
		int j;

		/* Buffer 0 is unmapped with dma_unmap_single() and the rest
		 * with dma_unmap_page(), mirroring how
		 * gve_tx_add_skb_no_copy_dqo() maps the linear part and frags.
		 *
		 * NOTE(review): this loop runs for any slot with num_bufs != 0
		 * and does not look at tx->dqo.qpl, although the copy path
		 * also sets num_bufs without DMA-mapping anything - confirm
		 * this is safe for QPL rings.
		 */
		for (j = 0; j < cur_state->num_bufs; j++) {
			if (j == 0) {
				dma_unmap_single(tx->dev,
					dma_unmap_addr(cur_state, dma[j]),
					dma_unmap_len(cur_state, len[j]),
					DMA_TO_DEVICE);
			} else {
				dma_unmap_page(tx->dev,
					dma_unmap_addr(cur_state, dma[j]),
					dma_unmap_len(cur_state, len[j]),
					DMA_TO_DEVICE);
			}
		}
		if (cur_state->skb) {
			dev_consume_skb_any(cur_state->skb);
			cur_state->skb = NULL;
		}
	}
}

/* Stops TX ring `idx`.
 *
 * Does nothing if the ring was never added to its notification block.
 * Otherwise: removes the NAPI instance, runs gve_clean_tx_done_dqo() once
 * with a NULL napi, resets the netdev TX queue state if the ring has one,
 * releases whatever is still held by pending packets, and finally detaches
 * the ring from its notification block.
 */
void gve_tx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	if (!gve_tx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
	if (tx->netdev_txq)
		netdev_tx_reset_queue(tx->netdev_txq);
	gve_tx_clean_pending_packets(tx);
	gve_tx_remove_from_block(priv, idx);
}

/* Frees all memory owned by `tx`.
 *
 * Every resource is either NULL-checked or released through kvfree(), and
 * every pointer is cleared afterwards, so this is also used as the error
 * path of gve_tx_alloc_ring_dqo() on a partially initialised ring.
 * `cfg` is not used in this function body.
 */
static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
				 struct gve_tx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	int idx = tx->q_num;
	size_t bytes;
	u32 qpl_id;

	if (tx->q_resources) {
		dma_free_coherent(hdev, sizeof(*tx->q_resources),
				  tx->q_resources, tx->q_resources_bus);
		tx->q_resources = NULL;
	}

	if (tx->dqo.compl_ring) {
		bytes = sizeof(tx->dqo.compl_ring[0]) *
			(tx->dqo.complq_mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
				  tx->complq_bus_dqo);
		tx->dqo.compl_ring = NULL;
	}

	if (tx->dqo.tx_ring) {
		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
		tx->dqo.tx_ring = NULL;
	}

	kvfree(tx->dqo.xsk_reorder_queue);
	tx->dqo.xsk_reorder_queue = NULL;

	kvfree(tx->dqo.pending_packets);
	tx->dqo.pending_packets = NULL;

	kvfree(tx->dqo.tx_qpl_buf_next);
	tx->dqo.tx_qpl_buf_next = NULL;

	if (tx->dqo.qpl) {
		qpl_id = gve_tx_qpl_id(priv, tx->q_num);
		gve_free_queue_page_list(priv, tx->dqo.qpl, qpl_id);
		tx->dqo.qpl = NULL;
	}

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

/* Allocates and initialises the free list of TX QPL buffers.
 *
 * The ring gets GVE_TX_BUFS_PER_PAGE_DQO buffers per QPL page.
 * tx_qpl_buf_next[i] holds the index of the buffer following buffer i, with
 * -1 terminating the list. The completion handler's list starts out empty.
 *
 * Returns 0 on success or -ENOMEM if the index array cannot be allocated.
 */
static int gve_tx_qpl_buf_init(struct gve_tx_ring *tx)
{
	int num_tx_qpl_bufs = GVE_TX_BUFS_PER_PAGE_DQO *
		tx->dqo.qpl->num_entries;
	int i;

	tx->dqo.tx_qpl_buf_next = kvcalloc(num_tx_qpl_bufs,
					   sizeof(tx->dqo.tx_qpl_buf_next[0]),
					   GFP_KERNEL);
	if (!tx->dqo.tx_qpl_buf_next)
		return -ENOMEM;

	tx->dqo.num_tx_qpl_bufs = num_tx_qpl_bufs;

	/* Generate free TX buf list */
	for (i = 0; i < num_tx_qpl_bufs - 1; i++)
		tx->dqo.tx_qpl_buf_next[i] = i + 1;
	tx->dqo.tx_qpl_buf_next[num_tx_qpl_bufs - 1] = -1;

	atomic_set_release(&tx->dqo_compl.free_tx_qpl_buf_head, -1);
	return 0;
}

/* Starts TX ring `idx`: attaches it to its notification block, binds it to
 * a netdev TX queue when idx is below priv->tx_cfg.num_queues, and registers
 * gve_napi_poll_dqo as its NAPI poll function.
 */
void gve_tx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	gve_tx_add_to_block(priv, idx);

	if (idx < priv->tx_cfg.num_queues)
		tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

/* Initialises `tx` as ring number `idx` and allocates its memory.
 *
 * Allocates the pending packet array, the XSK reorder queue (only for
 * rings past the regular queues when XDP rings are configured), the
 * descriptor ring, the completion ring, the queue resources and, unless
 * cfg->raw_addressing is set, a queue page list with its buffer free list.
 *
 * Returns 0 on success. On any failure everything allocated so far is
 * released through gve_tx_free_ring_dqo() and -ENOMEM is returned.
 */
static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
				 struct gve_tx_alloc_rings_cfg *cfg,
				 struct gve_tx_ring *tx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	int num_pending_packets;
	int qpl_page_cnt;
	size_t bytes;
	u32 qpl_id;
	int i;

	memset(tx, 0, sizeof(*tx));
	tx->q_num = idx;
	tx->dev = hdev;
	spin_lock_init(&tx->dqo_tx.xdp_lock);
	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

	/* Queue sizes must be a power of 2 */
	tx->mask = cfg->ring_size - 1;
	tx->dqo.complq_mask = tx->mask;

	/* The max number of pending packets determines the maximum number of
	 * descriptors which may be written to the completion queue.
	 *
	 * We must set the number small enough to make sure we never overrun the
	 * completion queue.
	 */
	num_pending_packets = tx->dqo.complq_mask + 1;

	/* Reserve space for descriptor completions, which will be reported at
	 * most every GVE_TX_MIN_RE_INTERVAL packets.
	 */
	num_pending_packets -=
		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

	/* Each packet may have at most 2 buffer completions if it receives both
	 * a miss and reinjection completion.
	 */
	num_pending_packets /= 2;

	/* Free-list links are s16, so the array cannot exceed S16_MAX. */
	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
					   sizeof(tx->dqo.pending_packets[0]),
					   GFP_KERNEL);
	if (!tx->dqo.pending_packets)
		goto err;

	/* Set up linked list of pending packets. The memset() above leaves
	 * dqo_tx.free_pending_packets at 0, i.e. at the first element.
	 */
	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
		tx->dqo.pending_packets[i].next = i + 1;

	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);

	/* Only alloc xsk pool for XDP queues */
	if (idx >= cfg->qcfg->num_queues && cfg->num_xdp_rings) {
		tx->dqo.xsk_reorder_queue =
			kvcalloc(tx->dqo.complq_mask + 1,
				 sizeof(tx->dqo.xsk_reorder_queue[0]),
				 GFP_KERNEL);
		if (!tx->dqo.xsk_reorder_queue)
			goto err;
	}

	/* Both completion tracking lists start out empty. */
	tx->dqo_compl.miss_completions.head = -1;
	tx->dqo_compl.miss_completions.tail = -1;
	tx->dqo_compl.timed_out_completions.head = -1;
	tx->dqo_compl.timed_out_completions.tail = -1;

	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->dqo.tx_ring)
		goto err;

	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
						&tx->complq_bus_dqo,
						GFP_KERNEL);
	if (!tx->dqo.compl_ring)
		goto err;

	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
					     &tx->q_resources_bus, GFP_KERNEL);
	if (!tx->q_resources)
		goto err;

	if (!cfg->raw_addressing) {
		qpl_id = gve_tx_qpl_id(priv, tx->q_num);
		qpl_page_cnt = priv->tx_pages_per_qpl;

		tx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!tx->dqo.qpl)
			goto err;

		if (gve_tx_qpl_buf_init(tx))
			goto err;
	}

	return 0;

err:
	gve_tx_free_ring_dqo(priv, tx, cfg);
	return -ENOMEM;
}

/* Allocates the TX ring array and every ring described by `cfg`.
 *
 * The array is sized for cfg->qcfg->max_queues entries, of which the first
 * num_queues + num_xdp_rings are initialised. On success the array is
 * stored in cfg->tx.
 *
 * Returns 0 on success, -EINVAL if more rings are requested than
 * max_queues allows, or a negative errno if an allocation fails.
 */
int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int total_queues;
	int err = 0;
	int i, j;

	total_queues = cfg->qcfg->num_queues + cfg->num_xdp_rings;
	if (total_queues > cfg->qcfg->max_queues) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc more than the max num of Tx rings\n");
		return -EINVAL;
	}

	tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
		      GFP_KERNEL);
	if (!tx)
		return -ENOMEM;

	for (i = 0; i < total_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, cfg, &tx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->tx = tx;
	return 0;

err:
	/* Ring i already cleaned up after itself; free the ones before it. */
	for (j = 0; j < i; j++)
		gve_tx_free_ring_dqo(priv, &tx[j], cfg);
	kvfree(tx);
	return err;
}

/* Frees every ring in cfg->tx and then the array itself, leaving cfg->tx
 * NULL. Does nothing if cfg->tx is already NULL.
 */
void gve_tx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int i;

	if (!tx)
		return;

	for (i = 0; i < cfg->qcfg->num_queues + cfg->qcfg->num_xdp_queues; i++)
		gve_tx_free_ring_dqo(priv, &tx[i], cfg);

	kvfree(tx);
	cfg->tx = NULL;
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

	/* At most tx->mask (ring size - 1) slots are ever reported free. */
	return tx->mask - num_used;
}

/* Checks if the requested number of slots are available in the ring.
 *
 * GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP extra slots are required on top of
 * `slots_req`. The cached head is only refreshed from the completion path
 * when the first check fails.
 */
static bool gve_has_tx_slots_available(struct gve_tx_ring *tx, u32 slots_req)
{
	u32 num_avail = num_avail_tx_slots(tx);

	slots_req += GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP;

	if (num_avail >= slots_req)
		return true;

	/* Update cached TX head pointer */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	return num_avail_tx_slots(tx) >= slots_req;
}

/* Returns true if the ring can take one more packet needing `desc_count`
 * descriptors and `buf_count` QPL buffers: a pending packet object, the
 * descriptor slots and the QPL buffers must all be available.
 */
static bool gve_has_avail_slots_tx_dqo(struct gve_tx_ring *tx,
				       int desc_count, int buf_count)
{
	return gve_has_pending_packet(tx) &&
		gve_has_tx_slots_available(tx, desc_count) &&
		gve_has_free_tx_qpl_bufs(tx, buf_count);
}

/* Stops the queue if the ring cannot take `desc_count` descriptors and
 * `buf_count` buffers.
 *
 * Return: 0 if stop is not required, -EBUSY if the queue was stopped.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx,
				 int desc_count, int buf_count)
{
	if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
		return 0;

	/* No space, so stop the queue */
	tx->stop_queue++;
	netif_tx_stop_queue(tx->netdev_txq);

	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
	mb();

	/* After stopping queue, check if we can transmit again in order to
	 * avoid TOCTOU bug.
	 */
	if (likely(!gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
		return -EBUSY;

	/* Space became available in the meantime: undo the stop. */
	netif_tx_start_queue(tx->netdev_txq);
	tx->wake_queue++;
	return 0;
}

/* Fills `metadata` for `skb`.
 *
 * The structure is zeroed and stamped with GVE_TX_METADATA_VERSION_DQO.
 * When the skb carries an L4 hash, a path hash is derived from it by
 * folding the 32-bit hash with XOR and keeping the low 15 bits; a result
 * of zero is replaced by its bitwise complement.
 */
static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
					struct gve_tx_metadata_dqo *metadata)
{
	memset(metadata, 0, sizeof(*metadata));
	metadata->version = GVE_TX_METADATA_VERSION_DQO;

	if (skb->l4_hash) {
		u16 path_hash = skb->hash ^ (skb->hash >> 16);

		path_hash &= (1 << 15) - 1;
		if (unlikely(path_hash == 0))
			path_hash = ~path_hash;

		metadata->path_hash = path_hash;
	}
}

/* Writes the packet descriptors for one buffer of `len` bytes at `addr`.
 *
 * The buffer is split into descriptors of at most GVE_TX_MAX_BUF_SIZE_DQO
 * bytes, starting at ring index *desc_idx, which is advanced (modulo the
 * ring size) past the descriptors written. Only the last descriptor of
 * the buffer can carry the end-of-packet flag, and only if `eop` is set.
 * Nothing is written when `len` is 0. `is_gso` is not used in this
 * function body.
 */
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
				     bool enable_csum, u32 len, u64 addr,
				     s16 compl_tag, bool eop, bool is_gso)
{
	while (len > 0) {
		struct gve_tx_pkt_desc_dqo *desc =
			&tx->dqo.tx_ring[*desc_idx].pkt;
		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
		bool cur_eop = eop && cur_len == len;

		*desc = (struct gve_tx_pkt_desc_dqo){
			.buf_addr = cpu_to_le64(addr),
			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
			.end_of_packet = cur_eop,
			.checksum_offload_enable = enable_csum,
			.compl_tag = cpu_to_le16(compl_tag),
			.buf_size = cur_len,
		};

		addr += cur_len;
		len -= cur_len;
		*desc_idx = (*desc_idx + 1) & tx->mask;
	}
}

/* Validates and prepares `skb` for TSO.
 *
 * Rejects skbs whose gso_size is below GVE_TX_MIN_TSO_MSS_DQO, whose GSO
 * type is neither TCPv4 nor TCPv6, or whose headers exceed
 * GVE_TX_MAX_HDR_SIZE_DQO. On the way the TCP checksum field is adjusted
 * by the payload length, which requires a private copy of the header.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
	struct tcphdr *tcp;
	int header_len;
	u32 paylen;
	int err;

	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
	 * of the TSO to be <= 262143.
	 *
	 * However, we don't validate these because:
	 * - Hypervisor enforces a limit of 9K MTU
	 * - Kernel will not produce a TSO larger than 64k
	 */

	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
		return -1;

	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -EINVAL;

	/* Needed because we will modify header. */
	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	tcp = tcp_hdr(skb);
	paylen = skb->len - skb_transport_offset(skb);
	csum_replace_by_diff(&tcp->check, (__force __wsum)htonl(paylen));
	header_len = skb_tcp_all_headers(skb);

	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
		return -EINVAL;

	return header_len;
}

/* Fills a TSO context descriptor for `skb`.
 *
 * Only metadata bytes 0 and 5..11 are copied into this descriptor type.
 * tso_total_len is the skb length without the headers and mss is the
 * skb's gso_size.
 */
static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
				     const struct sk_buff *skb,
				     const struct gve_tx_metadata_dqo *metadata,
				     int header_len)
{
	*desc = (struct gve_tx_tso_context_desc_dqo){
		.header_len = header_len,
		.cmd_dtype = {
			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
			.tso = 1,
		},
		.flex0 = metadata->bytes[0],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
	};
	desc->tso_total_len = skb->len - header_len;
	desc->mss = skb_shinfo(skb)->gso_size;
}

/* Fills a general context descriptor with metadata bytes 0..11. */
static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
			     const struct gve_tx_metadata_dqo *metadata)
{
	*desc = (struct gve_tx_general_context_desc_dqo){
		.flex0 = metadata->bytes[0],
		.flex1 = metadata->bytes[1],
		.flex2 = metadata->bytes[2],
		.flex3 = metadata->bytes[3],
		.flex4 = metadata->bytes[4],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
	};
}

/* Commits `desc_idx` as the new software tail of the ring.
 *
 * `desc_idx` is the index following the last descriptor written. If at
 * least GVE_TX_MIN_RE_INTERVAL descriptors lie between the last descriptor
 * that requested a report event and the last one just written, that last
 * descriptor is flagged with report_event and remembered. The doorbell is
 * not rung here.
 */
static void gve_tx_update_tail(struct gve_tx_ring *tx, u32 desc_idx)
{
	u32 last_desc_idx = (desc_idx - 1) & tx->mask;
	u32 last_report_event_interval =
			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

	/* Commit the changes to our state */
	tx->dqo_tx.tail = desc_idx;

	/* Request a descriptor completion on the last descriptor of the
	 * packet if we are allowed to by the HW enforced interval.
	 */

	if (unlikely(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) {
		tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
		tx->dqo_tx.last_re_idx = last_desc_idx;
	}
}

/* Posts `skb` by DMA-mapping its memory in place (no copy).
 *
 * The linear part is mapped with dma_map_single() and each frag with
 * skb_frag_dma_map(). Every mapping is recorded in pkt->dma[]/pkt->len[]
 * and turned into packet descriptors starting at *desc_idx.
 *
 * Returns 0 on success. On a mapping failure every mapping made so far is
 * undone, pkt->num_bufs is reset to 0 and -1 is returned; descriptors
 * already written are left in place and *desc_idx is not rewound.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
				      struct sk_buff *skb,
				      struct gve_tx_pending_packet_dqo *pkt,
				      s16 completion_tag,
				      u32 *desc_idx,
				      bool is_gso)
{
	bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	/* Note: HW requires that the size of a non-TSO packet be within the
	 * range of [17, 9728].
	 *
	 * We don't double check because
	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
	 * - Hypervisor won't allow MTU larger than 9216.
	 */

	pkt->num_bufs = 0;
	/* Map the linear portion of skb */
	{
		u32 len = skb_headlen(skb);
		dma_addr_t addr;

		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
					 completion_tag,
					 /*eop=*/shinfo->nr_frags == 0, is_gso);
	}

	/* Map each paged fragment; only the last one ends the packet. */
	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];
		bool is_eop = i == (shinfo->nr_frags - 1);
		u32 len = skb_frag_size(frag);
		dma_addr_t addr;

		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt,
					  dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
					 completion_tag, is_eop, is_gso);
	}

	return 0;
err:
	/* Undo the mappings recorded so far: index 0 is the linear part. */
	for (i = 0; i < pkt->num_bufs; i++) {
		if (i == 0) {
			dma_unmap_single(tx->dev,
					 dma_unmap_addr(pkt, dma[i]),
					 dma_unmap_len(pkt, len[i]),
					 DMA_TO_DEVICE);
		} else {
			dma_unmap_page(tx->dev,
				       dma_unmap_addr(pkt, dma[i]),
				       dma_unmap_len(pkt, len[i]),
				       DMA_TO_DEVICE);
		}
	}
	pkt->num_bufs = 0;
	return -1;
}

/* Tx buffer i corresponds to
 * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
 * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
 *
 * Stores the kernel virtual address of buffer `index` in *va and its DMA
 * address in *dma_addr.
 */
static void gve_tx_buf_get_addr(struct gve_tx_ring *tx,
				s16 index,
				void **va, dma_addr_t *dma_addr)
{
	int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
	int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO;

	*va = page_address(tx->dqo.qpl->pages[page_id]) + offset;
	*dma_addr = tx->dqo.qpl->page_buses[page_id] + offset;
}

/* Posts `skb` by copying it into TX QPL buffers.
 *
 * The packet is copied in chunks of at most GVE_TX_BUF_SIZE_DQO bytes, one
 * QPL buffer per chunk. Each chunk is synced for the device and described
 * by packet descriptors starting at *desc_idx; the buffer ids are recorded
 * in pkt->tx_qpl_buf_ids[].
 *
 * Returns 0 on success. If no buffer can be allocated, the buffers taken
 * so far are given back and -ENOMEM is returned.
 */
static int gve_tx_add_skb_copy_dqo(struct gve_tx_ring *tx,
				   struct sk_buff *skb,
				   struct gve_tx_pending_packet_dqo *pkt,
				   s16 completion_tag,
				   u32 *desc_idx,
				   bool is_gso)
{
	bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
	u32 copy_offset = 0;
	dma_addr_t dma_addr;
	u32 copy_len;
	s16 index;
	void *va;

	/* Break the packet into buffer size chunks */
	pkt->num_bufs = 0;
	while (copy_offset < skb->len) {
		index = gve_alloc_tx_qpl_buf(tx);
		if (unlikely(index == -1))
			goto err;

		gve_tx_buf_get_addr(tx, index, &va, &dma_addr);
		copy_len = min_t(u32, GVE_TX_BUF_SIZE_DQO,
				 skb->len - copy_offset);
		skb_copy_bits(skb, copy_offset, va, copy_len);

		copy_offset += copy_len;
		dma_sync_single_for_device(tx->dev, dma_addr,
					   copy_len, DMA_TO_DEVICE);
		/* copy_offset was already advanced, so it equals skb->len
		 * exactly when this chunk is the last one.
		 */
		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum,
					 copy_len,
					 dma_addr,
					 completion_tag,
					 copy_offset == skb->len,
					 is_gso);

		pkt->tx_qpl_buf_ids[pkt->num_bufs] = index;
		++tx->dqo_tx.alloc_tx_qpl_buf_cnt;
		++pkt->num_bufs;
	}

	return 0;
err:
	/* Should not be here if gve_has_free_tx_qpl_bufs() check is correct */
	gve_free_tx_qpl_bufs(tx, pkt);
	return -ENOMEM;
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
*/
static int gve_tx_add_skb_dqo(struct gve_tx_ring *tx,
			      struct sk_buff *skb)
{
	const bool is_gso = skb_is_gso(skb);
	u32 desc_idx = tx->dqo_tx.tail;
	struct gve_tx_pending_packet_dqo *pkt;
	struct gve_tx_metadata_dqo metadata;
	s16 completion_tag;

	pkt = gve_alloc_pending_packet(tx);
	if (!pkt)
		return -ENOMEM;

	pkt->skb = skb;
	pkt->type = GVE_TX_PENDING_PACKET_DQO_SKB;
	/* The completion tag is the packet's index in the pending array. */
	completion_tag = pkt - tx->dqo.pending_packets;

	gve_extract_tx_metadata_dqo(skb, &metadata);
	/* Descriptor order: optional TSO context, general context, data. */
	if (is_gso) {
		int header_len = gve_prep_tso(skb);

		if (unlikely(header_len < 0))
			goto err;

		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
					 skb, &metadata, header_len);
		desc_idx = (desc_idx + 1) & tx->mask;
	}

	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
				     &metadata);
	desc_idx = (desc_idx + 1) & tx->mask;

	/* Rings with a QPL copy the payload; others map the skb in place. */
	if (tx->dqo.qpl) {
		if (gve_tx_add_skb_copy_dqo(tx, skb, pkt,
					    completion_tag,
					    &desc_idx, is_gso))
			goto err;
	}  else {
		if (gve_tx_add_skb_no_copy_dqo(tx, skb, pkt,
					       completion_tag,
					       &desc_idx, is_gso))
			goto err;
	}

	tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;

	/* Only now does the ring tail move past the new descriptors. */
	gve_tx_update_tail(tx, desc_idx);
	return 0;

err:
	/* The tail was never advanced, so descriptors written above are
	 * simply overwritten by the next packet. The skb stays owned by
	 * the caller.
	 */
	pkt->skb = NULL;
	gve_free_pending_packet(tx, pkt);

	return -1;
}

/* Returns how many descriptors a buffer of `size` bytes needs when each
 * descriptor covers at most GVE_TX_MAX_BUF_SIZE_DQO bytes.
 */
static int gve_num_descs_per_buf(size_t size)
{
	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

/* Returns the number of data descriptors needed to post `skb` without
 * copying: the sum over the linear part and every frag.
 */
static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int num_descs;
	int i;

	num_descs = gve_num_descs_per_buf(skb_headlen(skb));

	for (i = 0; i < shinfo->nr_frags; i++) {
		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

		num_descs += gve_num_descs_per_buf(frag_size);
	}

	return num_descs;
}

/* Returns true if HW is capable
of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
	/* One buffer of every segment is reserved for the header. */
	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const int header_len = skb_tcp_all_headers(skb);
	const int gso_size = shinfo->gso_size;
	int cur_seg_num_bufs;
	int prev_frag_size;
	int cur_seg_size;
	int i;

	/* Payload bytes that share the linear area with the header. */
	cur_seg_size = skb_headlen(skb) - header_len;
	prev_frag_size = skb_headlen(skb);
	cur_seg_num_bufs = cur_seg_size > 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		/* A segment boundary was crossed in the previous buffer:
		 * restart the count with whatever spilled over.
		 */
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;

			if (prev_frag_size > GVE_TX_MAX_BUF_SIZE_DQO) {
				int prev_frag_remain = prev_frag_size %
					GVE_TX_MAX_BUF_SIZE_DQO;

				/* If the last descriptor of the previous frag
				 * is less than cur_seg_size, the segment will
				 * span two descriptors in the previous frag.
				 * Since max gso size (9728) is less than
				 * GVE_TX_MAX_BUF_SIZE_DQO, it is impossible
				 * for the segment to span more than two
				 * descriptors.
				 */
				if (prev_frag_remain &&
				    cur_seg_size > prev_frag_remain)
					cur_seg_num_bufs++;
			}
		}

		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
			return false;

		prev_frag_size = skb_frag_size(&shinfo->frags[i]);
		cur_seg_size += prev_frag_size;
	}

	return true;
}

/* Returns `features` with all GSO feature bits cleared when `skb` is a GSO
 * packet that gve_can_send_tso() rejects; otherwise returns `features`
 * unchanged. `dev` is not used in this function body.
 */
netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
					 struct net_device *dev,
					 netdev_features_t features)
{
	if (skb_is_gso(skb) && !gve_can_send_tso(skb))
		return features & ~NETIF_F_GSO_MASK;

	return features;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 *
 * A dropped SKB is freed here and counted in tx->dropped_pkt. When -1 is
 * returned the SKB is left untouched and the queue has been stopped.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct sk_buff *skb)
{
	int num_buffer_descs;
	int total_num_descs;

	if (skb_is_gso(skb) && unlikely(ipv6_hopopt_jumbo_remove(skb)))
		goto drop;

	if (tx->dqo.qpl) {
		/* We do not need to verify the number of buffers used per
		 * packet or per segment in case of TSO as with 2K size buffers
		 * none of the TX packet rules would be violated.
		 *
		 * gve_can_send_tso() checks that each TCP segment of gso_size is
		 * not distributed over more than 9 SKB frags.
		 */
		num_buffer_descs = DIV_ROUND_UP(skb->len, GVE_TX_BUF_SIZE_DQO);
	} else {
		num_buffer_descs = gve_num_buffer_descs_needed(skb);
		if (!skb_is_gso(skb)) {
			/* Too many buffers for one packet: fall back to a
			 * single linear buffer, or drop if that fails.
			 */
			if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
				if (unlikely(skb_linearize(skb) < 0))
					goto drop;

				num_buffer_descs = 1;
			}
		}
	}

	/* Metadata + (optional TSO) + data descriptors. */
	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs,
					   num_buffer_descs))) {
		return -1;
	}

	if (unlikely(gve_tx_add_skb_dqo(tx, skb) < 0))
		goto drop;

	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
	skb_tx_timestamp(skb);
	return 0;

drop:
	u64_stats_update_begin(&tx->statss);
	tx->dropped_pkt++;
	u64_stats_update_end(&tx->statss);
	dev_kfree_skb_any(skb);
	return 0;
}

/* Appends `completion_tag` to the XSK reorder queue.
 *
 * The slot is written first and the new tail is published afterwards with
 * release semantics; gve_xsk_reorder_queue_head() reads that tail with
 * acquire semantics.
 */
static void gve_xsk_reorder_queue_push_dqo(struct gve_tx_ring *tx,
					   u16 completion_tag)
{
	u32 tail = atomic_read(&tx->dqo_tx.xsk_reorder_queue_tail);

	tx->dqo.xsk_reorder_queue[tail] = completion_tag;
	tail = (tail + 1) & tx->dqo.complq_mask;
	atomic_set_release(&tx->dqo_tx.xsk_reorder_queue_tail, tail);
}

/* Returns the pending packet at the head of the XSK reorder queue without
 * removing it, or NULL if the queue is empty.
 *
 * The completion side keeps a cached copy of the tail and only refreshes
 * it from the TX side when the queue looks empty.
 */
static struct gve_tx_pending_packet_dqo *
gve_xsk_reorder_queue_head(struct gve_tx_ring *tx)
{
	u32 head = tx->dqo_compl.xsk_reorder_queue_head;

	if (head == tx->dqo_compl.xsk_reorder_queue_tail) {
		tx->dqo_compl.xsk_reorder_queue_tail =
			atomic_read_acquire(&tx->dqo_tx.xsk_reorder_queue_tail);

		if (head == tx->dqo_compl.xsk_reorder_queue_tail)
			return NULL;
	}

	return &tx->dqo.pending_packets[tx->dqo.xsk_reorder_queue[head]];
}

/* Advances the head of the XSK reorder queue by one entry, wrapping at the
 * queue size. Does not check whether the queue is empty.
 */
static void gve_xsk_reorder_queue_pop_dqo(struct gve_tx_ring *tx)
{
	tx->dqo_compl.xsk_reorder_queue_head++;
	tx->dqo_compl.xsk_reorder_queue_head &= tx->dqo.complq_mask;
}

/* Transmit a given skb and ring the doorbell.
*/
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;

	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		return NETDEV_TX_BUSY;
	}

	/* More packets are about to follow and the queue is still running:
	 * leave the doorbell to a later call.
	 */
	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
	return NETDEV_TX_OK;
}

/* Posts up to `budget` descriptors taken from the ring's XSK pool.
 *
 * Runs under tx->dqo_tx.xdp_lock. Every frame uses one pending packet and
 * one packet descriptor, and its completion tag is appended to the XSK
 * reorder queue. The doorbell is rung once at the end if anything was
 * posted.
 *
 * Returns true if the caller should poll again: either the whole budget
 * was used or the ring ran out of resources.
 */
static bool gve_xsk_tx_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
			   int budget)
{
	struct xsk_buff_pool *pool = tx->xsk_pool;
	struct xdp_desc desc;
	bool repoll = false;
	int sent = 0;

	spin_lock(&tx->dqo_tx.xdp_lock);
	for (; sent < budget; sent++) {
		struct gve_tx_pending_packet_dqo *pkt;
		s16 completion_tag;
		dma_addr_t addr;
		u32 desc_idx;

		if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) {
			repoll = true;
			break;
		}

		if (!xsk_tx_peek_desc(pool, &desc))
			break;

		/* Not NULL-checked: the availability check above includes
		 * gve_has_pending_packet().
		 */
		pkt = gve_alloc_pending_packet(tx);
		pkt->type = GVE_TX_PENDING_PACKET_DQO_XSK;
		pkt->num_bufs = 0;
		completion_tag = pkt - tx->dqo.pending_packets;

		addr = xsk_buff_raw_get_dma(pool, desc.addr);
		xsk_buff_raw_dma_sync_for_device(pool, addr, desc.len);

		/* Checksum offload enabled, end of packet set, not GSO. */
		desc_idx = tx->dqo_tx.tail;
		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
					 true, desc.len,
					 addr, completion_tag, true,
					 false);
		++pkt->num_bufs;
		gve_tx_update_tail(tx, desc_idx);
		tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;
		gve_xsk_reorder_queue_push_dqo(tx, completion_tag);
	}

	if (sent) {
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		xsk_tx_release(pool);
	}

	spin_unlock(&tx->dqo_tx.xdp_lock);

	u64_stats_update_begin(&tx->statss);
	tx->xdp_xsk_sent += sent;
	u64_stats_update_end(&tx->statss);

	return (sent == budget) || repoll;
}

/* Appends `pending_packet` to the tail of `list`.
 *
 * The list is doubly linked through the prev/next indices of the pending
 * packet array; -1 means "none".
 */
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 old_tail, index;

	index = pending_packet - tx->dqo.pending_packets;
	old_tail = list->tail;
	list->tail = index;
	if (old_tail == -1)
		list->head = index;
	else
		tx->dqo.pending_packets[old_tail].next = index;

	pending_packet->next = -1;
	pending_packet->prev = old_tail;
}

/* Unlinks `pkt` from `list`, fixing up the neighbours and the list's head
 * and tail as needed. pkt's own prev/next fields are left unchanged.
 */
static void remove_from_list(struct gve_tx_ring *tx,
			     struct gve_index_list *list,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	s16 prev_index, next_index;

	prev_index = pkt->prev;
	next_index = pkt->next;

	if (prev_index == -1) {
		/* Node is head */
		list->head = next_index;
	} else {
		tx->dqo.pending_packets[prev_index].next = next_index;
	}
	if (next_index == -1) {
		/* Node is tail */
		list->tail = prev_index;
	} else {
		tx->dqo.pending_packets[next_index].prev = prev_index;
	}
}

/* Undoes every DMA mapping recorded in `pkt` and resets pkt->num_bufs.
 *
 * Buffer 0 is unmapped unconditionally, without looking at num_bufs.
 */
static void gve_unmap_packet(struct device *dev,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	int i;

	/* SKB linear portion is guaranteed to be mapped */
	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
	for (i = 1; i < pkt->num_bufs; i++) {
		netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]),
					    dma_unmap_len(pkt, len[i]),
					    DMA_TO_DEVICE, 0);
	}
	pkt->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed
normally. 1181 * Miss compl + Re-inject compl = Packet completed normally. 1182 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. 1183 * Miss compl + Packet compl = Skipped i.e. packet not completed. 1184 */ 1185 static void gve_handle_packet_completion(struct gve_priv *priv, 1186 struct gve_tx_ring *tx, bool is_napi, 1187 u16 compl_tag, u64 *bytes, u64 *pkts, 1188 bool is_reinjection) 1189 { 1190 struct gve_tx_pending_packet_dqo *pending_packet; 1191 1192 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 1193 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 1194 priv->dev->name, (int)compl_tag); 1195 return; 1196 } 1197 1198 pending_packet = &tx->dqo.pending_packets[compl_tag]; 1199 1200 if (unlikely(is_reinjection)) { 1201 if (unlikely(pending_packet->state == 1202 GVE_PACKET_STATE_TIMED_OUT_COMPL)) { 1203 net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", 1204 priv->dev->name, (int)compl_tag); 1205 /* Packet was already completed as a result of timeout, 1206 * so just remove from list and free pending packet. 1207 */ 1208 remove_from_list(tx, 1209 &tx->dqo_compl.timed_out_completions, 1210 pending_packet); 1211 gve_free_pending_packet(tx, pending_packet); 1212 return; 1213 } 1214 if (unlikely(pending_packet->state != 1215 GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { 1216 /* No outstanding miss completion but packet allocated 1217 * implies packet receives a re-injection completion 1218 * without a prior miss completion. Return without 1219 * completing the packet. 1220 */ 1221 net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", 1222 priv->dev->name, (int)compl_tag); 1223 return; 1224 } 1225 remove_from_list(tx, &tx->dqo_compl.miss_completions, 1226 pending_packet); 1227 } else { 1228 /* Packet is allocated but not a pending data completion. 
*/ 1229 if (unlikely(pending_packet->state != 1230 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 1231 net_err_ratelimited("%s: No pending data completion: %d\n", 1232 priv->dev->name, (int)compl_tag); 1233 return; 1234 } 1235 } 1236 tx->dqo_tx.completed_packet_desc_cnt += pending_packet->num_bufs; 1237 1238 switch (pending_packet->type) { 1239 case GVE_TX_PENDING_PACKET_DQO_SKB: 1240 if (tx->dqo.qpl) 1241 gve_free_tx_qpl_bufs(tx, pending_packet); 1242 else 1243 gve_unmap_packet(tx->dev, pending_packet); 1244 (*pkts)++; 1245 *bytes += pending_packet->skb->len; 1246 1247 napi_consume_skb(pending_packet->skb, is_napi); 1248 pending_packet->skb = NULL; 1249 gve_free_pending_packet(tx, pending_packet); 1250 break; 1251 case GVE_TX_PENDING_PACKET_DQO_XDP_FRAME: 1252 gve_unmap_packet(tx->dev, pending_packet); 1253 (*pkts)++; 1254 *bytes += pending_packet->xdpf->len; 1255 1256 xdp_return_frame(pending_packet->xdpf); 1257 pending_packet->xdpf = NULL; 1258 gve_free_pending_packet(tx, pending_packet); 1259 break; 1260 case GVE_TX_PENDING_PACKET_DQO_XSK: 1261 pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE; 1262 break; 1263 default: 1264 WARN_ON_ONCE(1); 1265 } 1266 } 1267 1268 static void gve_handle_miss_completion(struct gve_priv *priv, 1269 struct gve_tx_ring *tx, u16 compl_tag, 1270 u64 *bytes, u64 *pkts) 1271 { 1272 struct gve_tx_pending_packet_dqo *pending_packet; 1273 1274 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 1275 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 1276 priv->dev->name, (int)compl_tag); 1277 return; 1278 } 1279 1280 pending_packet = &tx->dqo.pending_packets[compl_tag]; 1281 if (unlikely(pending_packet->state != 1282 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 1283 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", 1284 priv->dev->name, (int)pending_packet->state, 1285 (int)compl_tag); 1286 return; 1287 } 1288 1289 pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; 1290 /* jiffies can 
wraparound but time comparisons can handle overflows. */ 1291 pending_packet->timeout_jiffies = 1292 jiffies + 1293 secs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT); 1294 add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); 1295 1296 *bytes += pending_packet->skb->len; 1297 (*pkts)++; 1298 } 1299 1300 static void remove_miss_completions(struct gve_priv *priv, 1301 struct gve_tx_ring *tx) 1302 { 1303 struct gve_tx_pending_packet_dqo *pending_packet; 1304 s16 next_index; 1305 1306 next_index = tx->dqo_compl.miss_completions.head; 1307 while (next_index != -1) { 1308 pending_packet = &tx->dqo.pending_packets[next_index]; 1309 next_index = pending_packet->next; 1310 /* Break early because packets should timeout in order. */ 1311 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 1312 break; 1313 1314 remove_from_list(tx, &tx->dqo_compl.miss_completions, 1315 pending_packet); 1316 /* Unmap/free TX buffers and free skb but do not unallocate packet i.e. 1317 * the completion tag is not freed to ensure that the driver 1318 * can take appropriate action if a corresponding valid 1319 * completion is received later. 1320 */ 1321 if (tx->dqo.qpl) 1322 gve_free_tx_qpl_bufs(tx, pending_packet); 1323 else 1324 gve_unmap_packet(tx->dev, pending_packet); 1325 1326 /* This indicates the packet was dropped. */ 1327 dev_kfree_skb_any(pending_packet->skb); 1328 pending_packet->skb = NULL; 1329 1330 u64_stats_update_begin(&tx->statss); 1331 tx->dropped_pkt++; 1332 u64_stats_update_end(&tx->statss); 1333 1334 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n", 1335 priv->dev->name, 1336 (int)(pending_packet - tx->dqo.pending_packets)); 1337 1338 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; 1339 pending_packet->timeout_jiffies = 1340 jiffies + 1341 secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT); 1342 /* Maintain pending packet in another list so the packet can be 1343 * unallocated at a later time. 
1344 */ 1345 add_to_list(tx, &tx->dqo_compl.timed_out_completions, 1346 pending_packet); 1347 } 1348 } 1349 1350 static void remove_timed_out_completions(struct gve_priv *priv, 1351 struct gve_tx_ring *tx) 1352 { 1353 struct gve_tx_pending_packet_dqo *pending_packet; 1354 s16 next_index; 1355 1356 next_index = tx->dqo_compl.timed_out_completions.head; 1357 while (next_index != -1) { 1358 pending_packet = &tx->dqo.pending_packets[next_index]; 1359 next_index = pending_packet->next; 1360 /* Break early because packets should timeout in order. */ 1361 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 1362 break; 1363 1364 remove_from_list(tx, &tx->dqo_compl.timed_out_completions, 1365 pending_packet); 1366 1367 /* Need to count XSK packets in xsk_tx_completed. */ 1368 if (pending_packet->type == GVE_TX_PENDING_PACKET_DQO_XSK) 1369 pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE; 1370 else 1371 gve_free_pending_packet(tx, pending_packet); 1372 } 1373 } 1374 1375 static void gve_tx_process_xsk_completions(struct gve_tx_ring *tx) 1376 { 1377 u32 num_xsks = 0; 1378 1379 while (true) { 1380 struct gve_tx_pending_packet_dqo *pending_packet = 1381 gve_xsk_reorder_queue_head(tx); 1382 1383 if (!pending_packet || 1384 pending_packet->state != GVE_PACKET_STATE_XSK_COMPLETE) 1385 break; 1386 1387 num_xsks++; 1388 gve_xsk_reorder_queue_pop_dqo(tx); 1389 gve_free_pending_packet(tx, pending_packet); 1390 } 1391 1392 if (num_xsks) 1393 xsk_tx_completed(tx->xsk_pool, num_xsks); 1394 } 1395 1396 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 1397 struct napi_struct *napi) 1398 { 1399 u64 reinject_compl_bytes = 0; 1400 u64 reinject_compl_pkts = 0; 1401 int num_descs_cleaned = 0; 1402 u64 miss_compl_bytes = 0; 1403 u64 miss_compl_pkts = 0; 1404 u64 pkt_compl_bytes = 0; 1405 u64 pkt_compl_pkts = 0; 1406 1407 /* Limit in order to avoid blocking for too long */ 1408 while (!napi || pkt_compl_pkts < napi->weight) { 1409 struct gve_tx_compl_desc 
*compl_desc = 1410 &tx->dqo.compl_ring[tx->dqo_compl.head]; 1411 u16 type; 1412 1413 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) 1414 break; 1415 1416 /* Prefetch the next descriptor. */ 1417 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & 1418 tx->dqo.complq_mask]); 1419 1420 /* Do not read data until we own the descriptor */ 1421 dma_rmb(); 1422 type = compl_desc->type; 1423 1424 if (type == GVE_COMPL_TYPE_DQO_DESC) { 1425 /* This is the last descriptor fetched by HW plus one */ 1426 u16 tx_head = le16_to_cpu(compl_desc->tx_head); 1427 1428 atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); 1429 } else if (type == GVE_COMPL_TYPE_DQO_PKT) { 1430 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1431 if (compl_tag & GVE_ALT_MISS_COMPL_BIT) { 1432 compl_tag &= ~GVE_ALT_MISS_COMPL_BIT; 1433 gve_handle_miss_completion(priv, tx, compl_tag, 1434 &miss_compl_bytes, 1435 &miss_compl_pkts); 1436 } else { 1437 gve_handle_packet_completion(priv, tx, !!napi, 1438 compl_tag, 1439 &pkt_compl_bytes, 1440 &pkt_compl_pkts, 1441 false); 1442 } 1443 } else if (type == GVE_COMPL_TYPE_DQO_MISS) { 1444 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1445 1446 gve_handle_miss_completion(priv, tx, compl_tag, 1447 &miss_compl_bytes, 1448 &miss_compl_pkts); 1449 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { 1450 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1451 1452 gve_handle_packet_completion(priv, tx, !!napi, 1453 compl_tag, 1454 &reinject_compl_bytes, 1455 &reinject_compl_pkts, 1456 true); 1457 } 1458 1459 tx->dqo_compl.head = 1460 (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; 1461 /* Flip the generation bit when we wrap around */ 1462 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; 1463 num_descs_cleaned++; 1464 } 1465 1466 if (tx->netdev_txq) 1467 netdev_tx_completed_queue(tx->netdev_txq, 1468 pkt_compl_pkts + miss_compl_pkts, 1469 pkt_compl_bytes + miss_compl_bytes); 1470 1471 remove_miss_completions(priv, tx); 
1472 remove_timed_out_completions(priv, tx); 1473 1474 if (tx->xsk_pool) 1475 gve_tx_process_xsk_completions(tx); 1476 1477 u64_stats_update_begin(&tx->statss); 1478 tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; 1479 tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; 1480 u64_stats_update_end(&tx->statss); 1481 return num_descs_cleaned; 1482 } 1483 1484 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) 1485 { 1486 struct gve_tx_compl_desc *compl_desc; 1487 struct gve_tx_ring *tx = block->tx; 1488 struct gve_priv *priv = block->priv; 1489 1490 if (do_clean) { 1491 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, 1492 &block->napi); 1493 1494 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ 1495 mb(); 1496 1497 if (netif_tx_queue_stopped(tx->netdev_txq) && 1498 num_descs_cleaned > 0) { 1499 tx->wake_queue++; 1500 netif_tx_wake_queue(tx->netdev_txq); 1501 } 1502 } 1503 1504 /* Return true if we still have work. */ 1505 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; 1506 return compl_desc->generation != tx->dqo_compl.cur_gen_bit; 1507 } 1508 1509 bool gve_xsk_tx_poll_dqo(struct gve_notify_block *rx_block, int budget) 1510 { 1511 struct gve_rx_ring *rx = rx_block->rx; 1512 struct gve_priv *priv = rx->gve; 1513 struct gve_tx_ring *tx; 1514 1515 tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; 1516 if (tx->xsk_pool) 1517 return gve_xsk_tx_dqo(priv, tx, budget); 1518 1519 return 0; 1520 } 1521 1522 bool gve_xdp_poll_dqo(struct gve_notify_block *block) 1523 { 1524 struct gve_tx_compl_desc *compl_desc; 1525 struct gve_tx_ring *tx = block->tx; 1526 struct gve_priv *priv = block->priv; 1527 1528 gve_clean_tx_done_dqo(priv, tx, &block->napi); 1529 1530 /* Return true if we still have work. 
*/ 1531 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; 1532 return compl_desc->generation != tx->dqo_compl.cur_gen_bit; 1533 } 1534 1535 int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 1536 struct xdp_frame *xdpf) 1537 { 1538 struct gve_tx_pending_packet_dqo *pkt; 1539 u32 desc_idx = tx->dqo_tx.tail; 1540 s16 completion_tag; 1541 int num_descs = 1; 1542 dma_addr_t addr; 1543 int err; 1544 1545 if (unlikely(!gve_has_tx_slots_available(tx, num_descs))) 1546 return -EBUSY; 1547 1548 pkt = gve_alloc_pending_packet(tx); 1549 if (unlikely(!pkt)) 1550 return -EBUSY; 1551 1552 pkt->type = GVE_TX_PENDING_PACKET_DQO_XDP_FRAME; 1553 pkt->num_bufs = 0; 1554 pkt->xdpf = xdpf; 1555 completion_tag = pkt - tx->dqo.pending_packets; 1556 1557 /* Generate Packet Descriptor */ 1558 addr = dma_map_single(tx->dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); 1559 err = dma_mapping_error(tx->dev, addr); 1560 if (unlikely(err)) 1561 goto err; 1562 1563 dma_unmap_len_set(pkt, len[pkt->num_bufs], xdpf->len); 1564 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); 1565 pkt->num_bufs++; 1566 1567 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, 1568 false, xdpf->len, 1569 addr, completion_tag, true, 1570 false); 1571 1572 gve_tx_update_tail(tx, desc_idx); 1573 return 0; 1574 1575 err: 1576 pkt->xdpf = NULL; 1577 pkt->num_bufs = 0; 1578 gve_free_pending_packet(tx, pkt); 1579 return err; 1580 } 1581 1582 int gve_xdp_xmit_dqo(struct net_device *dev, int n, struct xdp_frame **frames, 1583 u32 flags) 1584 { 1585 struct gve_priv *priv = netdev_priv(dev); 1586 struct gve_tx_ring *tx; 1587 int i, err = 0, qid; 1588 1589 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 1590 return -EINVAL; 1591 1592 qid = gve_xdp_tx_queue_id(priv, 1593 smp_processor_id() % priv->tx_cfg.num_xdp_queues); 1594 1595 tx = &priv->tx[qid]; 1596 1597 spin_lock(&tx->dqo_tx.xdp_lock); 1598 for (i = 0; i < n; i++) { 1599 err = gve_xdp_xmit_one_dqo(priv, tx, frames[i]); 1600 if (err) 1601 break; 1602 } 1603 1604 if 
(flags & XDP_XMIT_FLUSH) 1605 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 1606 1607 spin_unlock(&tx->dqo_tx.xdp_lock); 1608 1609 u64_stats_update_begin(&tx->statss); 1610 tx->xdp_xmit += n; 1611 tx->xdp_xmit_errors += n - i; 1612 u64_stats_update_end(&tx->statss); 1613 1614 return i ? i : err; 1615 } 1616