1 // SPDX-License-Identifier: (GPL-2.0 OR MIT) 2 /* Google virtual Ethernet (gve) driver 3 * 4 * Copyright (C) 2015-2021 Google, Inc. 5 */ 6 7 #include "gve.h" 8 #include "gve_adminq.h" 9 #include "gve_utils.h" 10 #include "gve_dqo.h" 11 #include <net/ip.h> 12 #include <linux/bpf.h> 13 #include <linux/tcp.h> 14 #include <linux/slab.h> 15 #include <linux/skbuff.h> 16 #include <net/xdp_sock_drv.h> 17 18 /* Returns true if tx_bufs are available. */ 19 static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count) 20 { 21 int num_avail; 22 23 if (!tx->dqo.qpl) 24 return true; 25 26 num_avail = tx->dqo.num_tx_qpl_bufs - 27 (tx->dqo_tx.alloc_tx_qpl_buf_cnt - 28 tx->dqo_tx.free_tx_qpl_buf_cnt); 29 30 if (count <= num_avail) 31 return true; 32 33 /* Update cached value from dqo_compl. */ 34 tx->dqo_tx.free_tx_qpl_buf_cnt = 35 atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_cnt); 36 37 num_avail = tx->dqo.num_tx_qpl_bufs - 38 (tx->dqo_tx.alloc_tx_qpl_buf_cnt - 39 tx->dqo_tx.free_tx_qpl_buf_cnt); 40 41 return count <= num_avail; 42 } 43 44 static s16 45 gve_alloc_tx_qpl_buf(struct gve_tx_ring *tx) 46 { 47 s16 index; 48 49 index = tx->dqo_tx.free_tx_qpl_buf_head; 50 51 /* No TX buffers available, try to steal the list from the 52 * completion handler. 53 */ 54 if (unlikely(index == -1)) { 55 tx->dqo_tx.free_tx_qpl_buf_head = 56 atomic_xchg(&tx->dqo_compl.free_tx_qpl_buf_head, -1); 57 index = tx->dqo_tx.free_tx_qpl_buf_head; 58 59 if (unlikely(index == -1)) 60 return index; 61 } 62 63 /* Remove TX buf from free list */ 64 tx->dqo_tx.free_tx_qpl_buf_head = tx->dqo.tx_qpl_buf_next[index]; 65 66 return index; 67 } 68 69 static void 70 gve_free_tx_qpl_bufs(struct gve_tx_ring *tx, 71 struct gve_tx_pending_packet_dqo *pkt) 72 { 73 s16 index; 74 int i; 75 76 if (!pkt->num_bufs) 77 return; 78 79 index = pkt->tx_qpl_buf_ids[0]; 80 /* Create a linked list of buffers to be added to the free list */ 81 for (i = 1; i < pkt->num_bufs; i++) { 82 tx->dqo.tx_qpl_buf_next[index] = pkt->tx_qpl_buf_ids[i]; 83 index = pkt->tx_qpl_buf_ids[i]; 84 } 85 86 while (true) { 87 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_head); 88 89 tx->dqo.tx_qpl_buf_next[index] = old_head; 90 if (atomic_cmpxchg(&tx->dqo_compl.free_tx_qpl_buf_head, 91 old_head, 92 pkt->tx_qpl_buf_ids[0]) == old_head) { 93 break; 94 } 95 } 96 97 atomic_add(pkt->num_bufs, &tx->dqo_compl.free_tx_qpl_buf_cnt); 98 pkt->num_bufs = 0; 99 } 100 101 /* Returns true if a gve_tx_pending_packet_dqo object is available. */ 102 static bool gve_has_pending_packet(struct gve_tx_ring *tx) 103 { 104 /* Check TX path's list. */ 105 if (tx->dqo_tx.free_pending_packets != -1) 106 return true; 107 108 /* Check completion handler's list. */ 109 if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) 110 return true; 111 112 return false; 113 } 114 115 void gve_xdp_tx_flush_dqo(struct gve_priv *priv, u32 xdp_qid) 116 { 117 u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid); 118 struct gve_tx_ring *tx = &priv->tx[tx_qid]; 119 120 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 121 } 122 123 static struct gve_tx_pending_packet_dqo * 124 gve_alloc_pending_packet(struct gve_tx_ring *tx) 125 { 126 struct gve_tx_pending_packet_dqo *pending_packet; 127 s16 index; 128 129 index = tx->dqo_tx.free_pending_packets; 130 131 /* No pending_packets available, try to steal the list from the 132 * completion handler. 133 */ 134 if (unlikely(index == -1)) { 135 tx->dqo_tx.free_pending_packets = 136 atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); 137 index = tx->dqo_tx.free_pending_packets; 138 139 if (unlikely(index == -1)) 140 return NULL; 141 } 142 143 pending_packet = &tx->dqo.pending_packets[index]; 144 145 /* Remove pending_packet from free list */ 146 tx->dqo_tx.free_pending_packets = pending_packet->next; 147 pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; 148 149 return pending_packet; 150 } 151 152 static void 153 gve_free_pending_packet(struct gve_tx_ring *tx, 154 struct gve_tx_pending_packet_dqo *pending_packet) 155 { 156 s16 index = pending_packet - tx->dqo.pending_packets; 157 158 pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; 159 while (true) { 160 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); 161 162 pending_packet->next = old_head; 163 if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, 164 old_head, index) == old_head) { 165 break; 166 } 167 } 168 } 169 170 static void gve_unmap_packet(struct device *dev, 171 struct gve_tx_pending_packet_dqo *pkt) 172 { 173 int i; 174 175 if (!pkt->num_bufs) 176 return; 177 178 /* SKB linear portion is guaranteed to be mapped */ 179 dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), 180 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); 181 for (i = 1; i < pkt->num_bufs; i++) { 182 netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]), 183 dma_unmap_len(pkt, len[i]), 184 DMA_TO_DEVICE, 0); 185 } 186 pkt->num_bufs = 0; 187 } 188 189 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers. 190 */ 191 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) 192 { 193 int i; 194 195 for (i = 0; i < tx->dqo.num_pending_packets; i++) { 196 struct gve_tx_pending_packet_dqo *cur_state = 197 &tx->dqo.pending_packets[i]; 198 199 if (tx->dqo.qpl) 200 gve_free_tx_qpl_bufs(tx, cur_state); 201 else 202 gve_unmap_packet(tx->dev, cur_state); 203 204 if (cur_state->skb) { 205 dev_consume_skb_any(cur_state->skb); 206 cur_state->skb = NULL; 207 } 208 } 209 } 210 211 void gve_tx_stop_ring_dqo(struct gve_priv *priv, int idx) 212 { 213 int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); 214 struct gve_tx_ring *tx = &priv->tx[idx]; 215 216 if (!gve_tx_was_added_to_block(priv, idx)) 217 return; 218 219 gve_remove_napi(priv, ntfy_idx); 220 gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL); 221 if (tx->netdev_txq) 222 netdev_tx_reset_queue(tx->netdev_txq); 223 gve_tx_clean_pending_packets(tx); 224 gve_tx_remove_from_block(priv, idx); 225 } 226 227 static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 228 struct gve_tx_alloc_rings_cfg *cfg) 229 { 230 struct device *hdev = &priv->pdev->dev; 231 int idx = tx->q_num; 232 size_t bytes; 233 u32 qpl_id; 234 235 if (tx->q_resources) { 236 dma_free_coherent(hdev, sizeof(*tx->q_resources), 237 tx->q_resources, tx->q_resources_bus); 238 tx->q_resources = NULL; 239 } 240 241 if (tx->dqo.compl_ring) { 242 bytes = sizeof(tx->dqo.compl_ring[0]) * 243 (tx->dqo.complq_mask + 1); 244 dma_free_coherent(hdev, bytes, tx->dqo.compl_ring, 245 tx->complq_bus_dqo); 246 tx->dqo.compl_ring = NULL; 247 } 248 249 if (tx->dqo.tx_ring) { 250 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); 251 dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus); 252 tx->dqo.tx_ring = NULL; 253 } 254 255 kvfree(tx->dqo.xsk_reorder_queue); 256 tx->dqo.xsk_reorder_queue = NULL; 257 258 kvfree(tx->dqo.pending_packets); 259 tx->dqo.pending_packets = NULL; 260 261 kvfree(tx->dqo.tx_qpl_buf_next); 262 tx->dqo.tx_qpl_buf_next = NULL; 263 264 if (tx->dqo.qpl) { 265 qpl_id = gve_tx_qpl_id(priv, tx->q_num); 266 gve_free_queue_page_list(priv, tx->dqo.qpl, qpl_id); 267 tx->dqo.qpl = NULL; 268 } 269 270 netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); 271 } 272 273 static int gve_tx_qpl_buf_init(struct gve_tx_ring *tx) 274 { 275 int num_tx_qpl_bufs = GVE_TX_BUFS_PER_PAGE_DQO * 276 tx->dqo.qpl->num_entries; 277 int i; 278 279 tx->dqo.tx_qpl_buf_next = kvzalloc_objs(tx->dqo.tx_qpl_buf_next[0], 280 num_tx_qpl_bufs); 281 if (!tx->dqo.tx_qpl_buf_next) 282 return -ENOMEM; 283 284 tx->dqo.num_tx_qpl_bufs = num_tx_qpl_bufs; 285 286 /* Generate free TX buf list */ 287 for (i = 0; i < num_tx_qpl_bufs - 1; i++) 288 tx->dqo.tx_qpl_buf_next[i] = i + 1; 289 tx->dqo.tx_qpl_buf_next[num_tx_qpl_bufs - 1] = -1; 290 291 atomic_set_release(&tx->dqo_compl.free_tx_qpl_buf_head, -1); 292 return 0; 293 } 294 295 void gve_tx_start_ring_dqo(struct gve_priv *priv, int idx) 296 { 297 int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); 298 struct gve_tx_ring *tx = &priv->tx[idx]; 299 300 gve_tx_add_to_block(priv, idx); 301 302 if (idx < priv->tx_cfg.num_queues) 303 tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); 304 gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo); 305 } 306 307 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, 308 struct gve_tx_alloc_rings_cfg *cfg, 309 struct gve_tx_ring *tx, 310 int idx) 311 { 312 struct device *hdev = &priv->pdev->dev; 313 int num_pending_packets; 314 int qpl_page_cnt; 315 size_t bytes; 316 u32 qpl_id; 317 int i; 318 319 memset(tx, 0, sizeof(*tx)); 320 tx->q_num = idx; 321 tx->dev = hdev; 322 spin_lock_init(&tx->dqo_tx.xdp_lock); 323 atomic_set_release(&tx->dqo_compl.hw_tx_head, 0); 324 325 /* Queue sizes must be a power of 2 */ 326 tx->mask = cfg->ring_size - 1; 327 tx->dqo.complq_mask = tx->mask; 328 329 /* The max number of pending packets determines the maximum number of 330 * descriptors which maybe written to the completion queue. 331 * 332 * We must set the number small enough to make sure we never overrun the 333 * completion queue. 334 */ 335 num_pending_packets = tx->dqo.complq_mask + 1; 336 337 /* Reserve space for descriptor completions, which will be reported at 338 * most every GVE_TX_MIN_RE_INTERVAL packets. 339 */ 340 num_pending_packets -= 341 (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL; 342 343 /* Each packet may have at most 2 buffer completions if it receives both 344 * a miss and reinjection completion. 345 */ 346 num_pending_packets /= 2; 347 348 tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX); 349 tx->dqo.pending_packets = kvzalloc_objs(tx->dqo.pending_packets[0], 350 tx->dqo.num_pending_packets); 351 if (!tx->dqo.pending_packets) 352 goto err; 353 354 /* Set up linked list of pending packets */ 355 for (i = 0; i < tx->dqo.num_pending_packets - 1; i++) 356 tx->dqo.pending_packets[i].next = i + 1; 357 358 tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1; 359 atomic_set_release(&tx->dqo_compl.free_pending_packets, -1); 360 361 /* Only alloc xsk pool for XDP queues */ 362 if (idx >= cfg->qcfg->num_queues && cfg->num_xdp_rings) { 363 tx->dqo.xsk_reorder_queue = 364 kvcalloc(tx->dqo.complq_mask + 1, 365 sizeof(tx->dqo.xsk_reorder_queue[0]), 366 GFP_KERNEL); 367 if (!tx->dqo.xsk_reorder_queue) 368 goto err; 369 } 370 371 tx->dqo_compl.miss_completions.head = -1; 372 tx->dqo_compl.miss_completions.tail = -1; 373 tx->dqo_compl.timed_out_completions.head = -1; 374 tx->dqo_compl.timed_out_completions.tail = -1; 375 376 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); 377 tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); 378 if (!tx->dqo.tx_ring) 379 goto err; 380 381 bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1); 382 tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes, 383 &tx->complq_bus_dqo, 384 GFP_KERNEL); 385 if (!tx->dqo.compl_ring) 386 goto err; 387 388 tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources), 389 &tx->q_resources_bus, GFP_KERNEL); 390 if (!tx->q_resources) 391 goto err; 392 393 if (!cfg->raw_addressing) { 394 qpl_id = gve_tx_qpl_id(priv, tx->q_num); 395 qpl_page_cnt = priv->tx_pages_per_qpl; 396 397 tx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id, 398 qpl_page_cnt); 399 if (!tx->dqo.qpl) 400 goto err; 401 402 if (gve_tx_qpl_buf_init(tx)) 403 goto err; 404 } 405 406 return 0; 407 408 err: 409 gve_tx_free_ring_dqo(priv, tx, cfg); 410 return -ENOMEM; 411 } 412 413 int gve_tx_alloc_rings_dqo(struct gve_priv *priv, 414 struct gve_tx_alloc_rings_cfg *cfg) 415 { 416 struct gve_tx_ring *tx = cfg->tx; 417 int total_queues; 418 int err = 0; 419 int i, j; 420 421 total_queues = cfg->qcfg->num_queues + cfg->num_xdp_rings; 422 if (total_queues > cfg->qcfg->max_queues) { 423 netif_err(priv, drv, priv->dev, 424 "Cannot alloc more than the max num of Tx rings\n"); 425 return -EINVAL; 426 } 427 428 tx = kvzalloc_objs(struct gve_tx_ring, cfg->qcfg->max_queues); 429 if (!tx) 430 return -ENOMEM; 431 432 for (i = 0; i < total_queues; i++) { 433 err = gve_tx_alloc_ring_dqo(priv, cfg, &tx[i], i); 434 if (err) { 435 netif_err(priv, drv, priv->dev, 436 "Failed to alloc tx ring=%d: err=%d\n", 437 i, err); 438 goto err; 439 } 440 } 441 442 cfg->tx = tx; 443 return 0; 444 445 err: 446 for (j = 0; j < i; j++) 447 gve_tx_free_ring_dqo(priv, &tx[j], cfg); 448 kvfree(tx); 449 return err; 450 } 451 452 void gve_tx_free_rings_dqo(struct gve_priv *priv, 453 struct gve_tx_alloc_rings_cfg *cfg) 454 { 455 struct gve_tx_ring *tx = cfg->tx; 456 int i; 457 458 if (!tx) 459 return; 460 461 for (i = 0; i < cfg->qcfg->num_queues + cfg->qcfg->num_xdp_queues; i++) 462 gve_tx_free_ring_dqo(priv, &tx[i], cfg); 463 464 kvfree(tx); 465 cfg->tx = NULL; 466 } 467 468 /* Returns the number of slots available in the ring */ 469 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx) 470 { 471 u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; 472 473 return tx->mask - num_used; 474 } 475 476 /* Checks if the requested number of slots are available in the ring */ 477 static bool gve_has_tx_slots_available(struct gve_tx_ring *tx, u32 slots_req) 478 { 479 u32 num_avail = num_avail_tx_slots(tx); 480 481 slots_req += GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP; 482 483 if (num_avail >= slots_req) 484 return true; 485 486 /* Update cached TX head pointer */ 487 tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); 488 489 return num_avail_tx_slots(tx) >= slots_req; 490 } 491 492 static bool gve_has_avail_slots_tx_dqo(struct gve_tx_ring *tx, 493 int desc_count, int buf_count) 494 { 495 return gve_has_pending_packet(tx) && 496 gve_has_tx_slots_available(tx, desc_count) && 497 gve_has_free_tx_qpl_bufs(tx, buf_count); 498 } 499 500 /* Stops the queue if available descriptors is less than 'count'. 501 * Return: 0 if stop is not required. 502 */ 503 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, 504 int desc_count, int buf_count) 505 { 506 if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count))) 507 return 0; 508 509 /* No space, so stop the queue */ 510 tx->stop_queue++; 511 netif_tx_stop_queue(tx->netdev_txq); 512 513 /* Sync with restarting queue in `gve_tx_poll_dqo()` */ 514 mb(); 515 516 /* After stopping queue, check if we can transmit again in order to 517 * avoid TOCTOU bug. 518 */ 519 if (likely(!gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count))) 520 return -EBUSY; 521 522 netif_tx_start_queue(tx->netdev_txq); 523 tx->wake_queue++; 524 return 0; 525 } 526 527 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, 528 struct gve_tx_metadata_dqo *metadata) 529 { 530 memset(metadata, 0, sizeof(*metadata)); 531 metadata->version = GVE_TX_METADATA_VERSION_DQO; 532 533 if (skb->l4_hash) { 534 u16 path_hash = skb->hash ^ (skb->hash >> 16); 535 536 path_hash &= (1 << 15) - 1; 537 if (unlikely(path_hash == 0)) 538 path_hash = ~path_hash; 539 540 metadata->path_hash = path_hash; 541 } 542 } 543 544 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, 545 bool enable_csum, u32 len, u64 addr, 546 s16 compl_tag, bool eop, bool is_gso) 547 { 548 while (len > 0) { 549 struct gve_tx_pkt_desc_dqo *desc = 550 &tx->dqo.tx_ring[*desc_idx].pkt; 551 u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); 552 bool cur_eop = eop && cur_len == len; 553 554 *desc = (struct gve_tx_pkt_desc_dqo){ 555 .buf_addr = cpu_to_le64(addr), 556 .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, 557 .end_of_packet = cur_eop, 558 .checksum_offload_enable = enable_csum, 559 .compl_tag = cpu_to_le16(compl_tag), 560 .buf_size = cur_len, 561 }; 562 563 addr += cur_len; 564 len -= cur_len; 565 *desc_idx = (*desc_idx + 1) & tx->mask; 566 } 567 } 568 569 /* Validates and prepares `skb` for TSO. 570 * 571 * Returns header length, or < 0 if invalid. 572 */ 573 static int gve_prep_tso(struct sk_buff *skb) 574 { 575 struct tcphdr *tcp; 576 int header_len; 577 u32 paylen; 578 int err; 579 580 /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length 581 * of the TSO to be <= 262143. 582 * 583 * However, we don't validate these because: 584 * - Hypervisor enforces a limit of 9K MTU 585 * - Kernel will not produce a TSO larger than 64k 586 */ 587 588 if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) 589 return -1; 590 591 if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 592 return -EINVAL; 593 594 /* Needed because we will modify header. */ 595 err = skb_cow_head(skb, 0); 596 if (err < 0) 597 return err; 598 599 tcp = tcp_hdr(skb); 600 paylen = skb->len - skb_transport_offset(skb); 601 csum_replace_by_diff(&tcp->check, (__force __wsum)htonl(paylen)); 602 header_len = skb_tcp_all_headers(skb); 603 604 if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) 605 return -EINVAL; 606 607 return header_len; 608 } 609 610 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, 611 const struct sk_buff *skb, 612 const struct gve_tx_metadata_dqo *metadata, 613 int header_len) 614 { 615 *desc = (struct gve_tx_tso_context_desc_dqo){ 616 .header_len = header_len, 617 .cmd_dtype = { 618 .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, 619 .tso = 1, 620 }, 621 .flex0 = metadata->bytes[0], 622 .flex5 = metadata->bytes[5], 623 .flex6 = metadata->bytes[6], 624 .flex7 = metadata->bytes[7], 625 .flex8 = metadata->bytes[8], 626 .flex9 = metadata->bytes[9], 627 .flex10 = metadata->bytes[10], 628 .flex11 = metadata->bytes[11], 629 }; 630 desc->tso_total_len = skb->len - header_len; 631 desc->mss = skb_shinfo(skb)->gso_size; 632 } 633 634 static void 635 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, 636 const struct gve_tx_metadata_dqo *metadata) 637 { 638 *desc = (struct gve_tx_general_context_desc_dqo){ 639 .flex0 = metadata->bytes[0], 640 .flex1 = metadata->bytes[1], 641 .flex2 = metadata->bytes[2], 642 .flex3 = metadata->bytes[3], 643 .flex4 = metadata->bytes[4], 644 .flex5 = metadata->bytes[5], 645 .flex6 = metadata->bytes[6], 646 .flex7 = metadata->bytes[7], 647 .flex8 = metadata->bytes[8], 648 .flex9 = metadata->bytes[9], 649 .flex10 = metadata->bytes[10], 650 .flex11 = metadata->bytes[11], 651 .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, 652 }; 653 } 654 655 static void gve_tx_update_tail(struct gve_tx_ring *tx, u32 desc_idx) 656 { 657 u32 last_desc_idx = (desc_idx - 1) & tx->mask; 658 u32 last_report_event_interval = 659 (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; 660 661 /* Commit the changes to our state */ 662 tx->dqo_tx.tail = desc_idx; 663 664 /* Request a descriptor completion on the last descriptor of the 665 * packet if we are allowed to by the HW enforced interval. 666 */ 667 668 if (unlikely(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) { 669 tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; 670 tx->dqo_tx.last_re_idx = last_desc_idx; 671 } 672 } 673 674 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, 675 struct sk_buff *skb, 676 struct gve_tx_pending_packet_dqo *pkt, 677 s16 completion_tag, 678 u32 *desc_idx, 679 bool is_gso) 680 { 681 bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL; 682 const struct skb_shared_info *shinfo = skb_shinfo(skb); 683 int i; 684 685 /* Note: HW requires that the size of a non-TSO packet be within the 686 * range of [17, 9728]. 687 * 688 * We don't double check because 689 * - We limited `netdev->min_mtu` to ETH_MIN_MTU. 690 * - Hypervisor won't allow MTU larger than 9216. 691 */ 692 693 pkt->num_bufs = 0; 694 /* Map the linear portion of skb */ 695 { 696 u32 len = skb_headlen(skb); 697 dma_addr_t addr; 698 699 addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); 700 if (unlikely(dma_mapping_error(tx->dev, addr))) 701 goto err; 702 703 dma_unmap_len_set(pkt, len[pkt->num_bufs], len); 704 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); 705 ++pkt->num_bufs; 706 707 gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr, 708 completion_tag, 709 /*eop=*/shinfo->nr_frags == 0, is_gso); 710 } 711 712 for (i = 0; i < shinfo->nr_frags; i++) { 713 const skb_frag_t *frag = &shinfo->frags[i]; 714 bool is_eop = i == (shinfo->nr_frags - 1); 715 u32 len = skb_frag_size(frag); 716 dma_addr_t addr; 717 718 addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); 719 if (unlikely(dma_mapping_error(tx->dev, addr))) 720 goto err; 721 722 dma_unmap_len_set(pkt, len[pkt->num_bufs], len); 723 netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt, 724 dma[pkt->num_bufs], addr); 725 ++pkt->num_bufs; 726 727 gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr, 728 completion_tag, is_eop, is_gso); 729 } 730 731 return 0; 732 err: 733 for (i = 0; i < pkt->num_bufs; i++) { 734 if (i == 0) { 735 dma_unmap_single(tx->dev, 736 dma_unmap_addr(pkt, dma[i]), 737 dma_unmap_len(pkt, len[i]), 738 DMA_TO_DEVICE); 739 } else { 740 dma_unmap_page(tx->dev, 741 dma_unmap_addr(pkt, dma[i]), 742 dma_unmap_len(pkt, len[i]), 743 DMA_TO_DEVICE); 744 } 745 } 746 pkt->num_bufs = 0; 747 return -1; 748 } 749 750 /* Tx buffer i corresponds to 751 * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO 752 * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO 753 */ 754 static void gve_tx_buf_get_addr(struct gve_tx_ring *tx, 755 s16 index, 756 void **va, dma_addr_t *dma_addr) 757 { 758 int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); 759 int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO; 760 761 *va = page_address(tx->dqo.qpl->pages[page_id]) + offset; 762 *dma_addr = tx->dqo.qpl->page_buses[page_id] + offset; 763 } 764 765 static int gve_tx_add_skb_copy_dqo(struct gve_tx_ring *tx, 766 struct sk_buff *skb, 767 struct gve_tx_pending_packet_dqo *pkt, 768 s16 completion_tag, 769 u32 *desc_idx, 770 bool is_gso) 771 { 772 bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL; 773 u32 copy_offset = 0; 774 dma_addr_t dma_addr; 775 u32 copy_len; 776 s16 index; 777 void *va; 778 779 /* Break the packet into buffer size chunks */ 780 pkt->num_bufs = 0; 781 while (copy_offset < skb->len) { 782 index = gve_alloc_tx_qpl_buf(tx); 783 if (unlikely(index == -1)) 784 goto err; 785 786 gve_tx_buf_get_addr(tx, index, &va, &dma_addr); 787 copy_len = min_t(u32, GVE_TX_BUF_SIZE_DQO, 788 skb->len - copy_offset); 789 skb_copy_bits(skb, copy_offset, va, copy_len); 790 791 copy_offset += copy_len; 792 dma_sync_single_for_device(tx->dev, dma_addr, 793 copy_len, DMA_TO_DEVICE); 794 gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, 795 copy_len, 796 dma_addr, 797 completion_tag, 798 copy_offset == skb->len, 799 is_gso); 800 801 pkt->tx_qpl_buf_ids[pkt->num_bufs] = index; 802 ++tx->dqo_tx.alloc_tx_qpl_buf_cnt; 803 ++pkt->num_bufs; 804 } 805 806 return 0; 807 err: 808 /* Should not be here if gve_has_free_tx_qpl_bufs() check is correct */ 809 gve_free_tx_qpl_bufs(tx, pkt); 810 return -ENOMEM; 811 } 812 813 /* Returns 0 on success, or < 0 on error. 814 * 815 * Before this function is called, the caller must ensure 816 * gve_has_pending_packet(tx) returns true. 817 */ 818 static int gve_tx_add_skb_dqo(struct gve_tx_ring *tx, 819 struct sk_buff *skb) 820 { 821 const bool is_gso = skb_is_gso(skb); 822 u32 desc_idx = tx->dqo_tx.tail; 823 struct gve_tx_pending_packet_dqo *pkt; 824 struct gve_tx_metadata_dqo metadata; 825 s16 completion_tag; 826 827 pkt = gve_alloc_pending_packet(tx); 828 if (!pkt) 829 return -ENOMEM; 830 831 pkt->skb = skb; 832 pkt->type = GVE_TX_PENDING_PACKET_DQO_SKB; 833 completion_tag = pkt - tx->dqo.pending_packets; 834 835 gve_extract_tx_metadata_dqo(skb, &metadata); 836 if (is_gso) { 837 int header_len = gve_prep_tso(skb); 838 839 if (unlikely(header_len < 0)) 840 goto err; 841 842 gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, 843 skb, &metadata, header_len); 844 desc_idx = (desc_idx + 1) & tx->mask; 845 } 846 847 gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, 848 &metadata); 849 desc_idx = (desc_idx + 1) & tx->mask; 850 851 if (tx->dqo.qpl) { 852 if (gve_tx_add_skb_copy_dqo(tx, skb, pkt, 853 completion_tag, 854 &desc_idx, is_gso)) 855 goto err; 856 } else { 857 if (gve_tx_add_skb_no_copy_dqo(tx, skb, pkt, 858 completion_tag, 859 &desc_idx, is_gso)) 860 goto err; 861 } 862 863 tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs; 864 865 gve_tx_update_tail(tx, desc_idx); 866 return 0; 867 868 err: 869 pkt->skb = NULL; 870 gve_free_pending_packet(tx, pkt); 871 872 return -1; 873 } 874 875 static int gve_num_descs_per_buf(size_t size) 876 { 877 return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); 878 } 879 880 static int gve_num_buffer_descs_needed(const struct sk_buff *skb) 881 { 882 const struct skb_shared_info *shinfo = skb_shinfo(skb); 883 int num_descs; 884 int i; 885 886 num_descs = gve_num_descs_per_buf(skb_headlen(skb)); 887 888 for (i = 0; i < shinfo->nr_frags; i++) { 889 unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); 890 891 num_descs += gve_num_descs_per_buf(frag_size); 892 } 893 894 return num_descs; 895 } 896 897 /* Returns true if HW is capable of sending TSO represented by `skb`. 898 * 899 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. 900 * - The header is counted as one buffer for every single segment. 901 * - A buffer which is split between two segments is counted for both. 902 * - If a buffer contains both header and payload, it is counted as two buffers. 903 */ 904 static bool gve_can_send_tso(const struct sk_buff *skb) 905 { 906 const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; 907 const struct skb_shared_info *shinfo = skb_shinfo(skb); 908 const int header_len = skb_tcp_all_headers(skb); 909 const int gso_size = shinfo->gso_size; 910 int cur_seg_num_bufs; 911 int prev_frag_size; 912 int cur_seg_size; 913 int i; 914 915 cur_seg_size = skb_headlen(skb) - header_len; 916 prev_frag_size = skb_headlen(skb); 917 cur_seg_num_bufs = cur_seg_size > 0; 918 919 for (i = 0; i < shinfo->nr_frags; i++) { 920 if (cur_seg_size >= gso_size) { 921 cur_seg_size %= gso_size; 922 cur_seg_num_bufs = cur_seg_size > 0; 923 924 if (prev_frag_size > GVE_TX_MAX_BUF_SIZE_DQO) { 925 int prev_frag_remain = prev_frag_size % 926 GVE_TX_MAX_BUF_SIZE_DQO; 927 928 /* If the last descriptor of the previous frag 929 * is less than cur_seg_size, the segment will 930 * span two descriptors in the previous frag. 931 * Since max gso size (9728) is less than 932 * GVE_TX_MAX_BUF_SIZE_DQO, it is impossible 933 * for the segment to span more than two 934 * descriptors. 935 */ 936 if (prev_frag_remain && 937 cur_seg_size > prev_frag_remain) 938 cur_seg_num_bufs++; 939 } 940 } 941 942 if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) 943 return false; 944 945 prev_frag_size = skb_frag_size(&shinfo->frags[i]); 946 cur_seg_size += prev_frag_size; 947 } 948 949 return true; 950 } 951 952 netdev_features_t gve_features_check_dqo(struct sk_buff *skb, 953 struct net_device *dev, 954 netdev_features_t features) 955 { 956 if (skb_is_gso(skb) && !gve_can_send_tso(skb)) 957 return features & ~NETIF_F_GSO_MASK; 958 959 return features; 960 } 961 962 /* Attempt to transmit specified SKB. 963 * 964 * Returns 0 if the SKB was transmitted or dropped. 965 * Returns -1 if there is not currently enough space to transmit the SKB. 966 */ 967 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, 968 struct sk_buff *skb) 969 { 970 int num_buffer_descs; 971 int total_num_descs; 972 973 if (tx->dqo.qpl) { 974 /* We do not need to verify the number of buffers used per 975 * packet or per segment in case of TSO as with 2K size buffers 976 * none of the TX packet rules would be violated. 977 * 978 * gve_can_send_tso() checks that each TCP segment of gso_size is 979 * not distributed over more than 9 SKB frags.. 980 */ 981 num_buffer_descs = DIV_ROUND_UP(skb->len, GVE_TX_BUF_SIZE_DQO); 982 } else { 983 num_buffer_descs = gve_num_buffer_descs_needed(skb); 984 if (!skb_is_gso(skb)) { 985 if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { 986 if (unlikely(skb_linearize(skb) < 0)) 987 goto drop; 988 989 num_buffer_descs = 1; 990 } 991 } 992 } 993 994 /* Metadata + (optional TSO) + data descriptors. */ 995 total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; 996 if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs, 997 num_buffer_descs))) { 998 return -1; 999 } 1000 1001 if (unlikely(gve_tx_add_skb_dqo(tx, skb) < 0)) 1002 goto drop; 1003 1004 netdev_tx_sent_queue(tx->netdev_txq, skb->len); 1005 skb_tx_timestamp(skb); 1006 return 0; 1007 1008 drop: 1009 u64_stats_update_begin(&tx->statss); 1010 tx->dropped_pkt++; 1011 u64_stats_update_end(&tx->statss); 1012 dev_kfree_skb_any(skb); 1013 return 0; 1014 } 1015 1016 static void gve_xsk_reorder_queue_push_dqo(struct gve_tx_ring *tx, 1017 u16 completion_tag) 1018 { 1019 u32 tail = atomic_read(&tx->dqo_tx.xsk_reorder_queue_tail); 1020 1021 tx->dqo.xsk_reorder_queue[tail] = completion_tag; 1022 tail = (tail + 1) & tx->dqo.complq_mask; 1023 atomic_set_release(&tx->dqo_tx.xsk_reorder_queue_tail, tail); 1024 } 1025 1026 static struct gve_tx_pending_packet_dqo * 1027 gve_xsk_reorder_queue_head(struct gve_tx_ring *tx) 1028 { 1029 u32 head = tx->dqo_compl.xsk_reorder_queue_head; 1030 1031 if (head == tx->dqo_compl.xsk_reorder_queue_tail) { 1032 tx->dqo_compl.xsk_reorder_queue_tail = 1033 atomic_read_acquire(&tx->dqo_tx.xsk_reorder_queue_tail); 1034 1035 if (head == tx->dqo_compl.xsk_reorder_queue_tail) 1036 return NULL; 1037 } 1038 1039 return &tx->dqo.pending_packets[tx->dqo.xsk_reorder_queue[head]]; 1040 } 1041 1042 static void gve_xsk_reorder_queue_pop_dqo(struct gve_tx_ring *tx) 1043 { 1044 tx->dqo_compl.xsk_reorder_queue_head++; 1045 tx->dqo_compl.xsk_reorder_queue_head &= tx->dqo.complq_mask; 1046 } 1047 1048 /* Transmit a given skb and ring the doorbell. */ 1049 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) 1050 { 1051 struct gve_priv *priv = netdev_priv(dev); 1052 struct gve_tx_ring *tx; 1053 1054 tx = &priv->tx[skb_get_queue_mapping(skb)]; 1055 if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { 1056 /* We need to ring the txq doorbell -- we have stopped the Tx 1057 * queue for want of resources, but prior calls to gve_tx() 1058 * may have added descriptors without ringing the doorbell. 1059 */ 1060 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 1061 return NETDEV_TX_BUSY; 1062 } 1063 1064 if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) 1065 return NETDEV_TX_OK; 1066 1067 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 1068 return NETDEV_TX_OK; 1069 } 1070 1071 static bool gve_xsk_tx_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 1072 int budget) 1073 { 1074 struct xsk_buff_pool *pool = tx->xsk_pool; 1075 struct xdp_desc desc; 1076 bool repoll = false; 1077 int sent = 0; 1078 1079 spin_lock(&tx->dqo_tx.xdp_lock); 1080 for (; sent < budget; sent++) { 1081 struct gve_tx_pending_packet_dqo *pkt; 1082 s16 completion_tag; 1083 dma_addr_t addr; 1084 u32 desc_idx; 1085 1086 if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) { 1087 repoll = true; 1088 break; 1089 } 1090 1091 if (!xsk_tx_peek_desc(pool, &desc)) 1092 break; 1093 1094 pkt = gve_alloc_pending_packet(tx); 1095 pkt->type = GVE_TX_PENDING_PACKET_DQO_XSK; 1096 pkt->num_bufs = 0; 1097 completion_tag = pkt - tx->dqo.pending_packets; 1098 1099 addr = xsk_buff_raw_get_dma(pool, desc.addr); 1100 xsk_buff_raw_dma_sync_for_device(pool, addr, desc.len); 1101 1102 desc_idx = tx->dqo_tx.tail; 1103 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, 1104 true, desc.len, 1105 addr, completion_tag, true, 1106 false); 1107 ++pkt->num_bufs; 1108 gve_tx_update_tail(tx, desc_idx); 1109 tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs; 1110 gve_xsk_reorder_queue_push_dqo(tx, completion_tag); 1111 } 1112 1113 if (sent) { 1114 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 1115 xsk_tx_release(pool); 1116 } 1117 1118 spin_unlock(&tx->dqo_tx.xdp_lock); 1119 1120 u64_stats_update_begin(&tx->statss); 1121 tx->xdp_xsk_sent += sent; 1122 u64_stats_update_end(&tx->statss); 1123 1124 return (sent == budget) || repoll; 1125 } 1126 1127 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, 1128 struct gve_tx_pending_packet_dqo *pending_packet) 1129 { 1130 s16 old_tail, index; 1131 1132 index = pending_packet - tx->dqo.pending_packets; 1133 old_tail = list->tail; 1134 list->tail = index; 1135 if (old_tail == -1) 1136 list->head = index; 1137 else 1138 tx->dqo.pending_packets[old_tail].next = index; 1139 1140 pending_packet->next = -1; 1141 pending_packet->prev = old_tail; 1142 } 1143 1144 static void remove_from_list(struct gve_tx_ring *tx, 1145 struct gve_index_list *list, 1146 struct gve_tx_pending_packet_dqo *pkt) 1147 { 1148 s16 prev_index, next_index; 1149 1150 prev_index = pkt->prev; 1151 next_index = pkt->next; 1152 1153 if (prev_index == -1) { 1154 /* Node is head */ 1155 list->head = next_index; 1156 } else { 1157 tx->dqo.pending_packets[prev_index].next = next_index; 1158 } 1159 if (next_index == -1) { 1160 /* Node is tail */ 1161 list->tail = prev_index; 1162 } else { 1163 tx->dqo.pending_packets[next_index].prev = prev_index; 1164 } 1165 } 1166 1167 /* Completion types and expected behavior: 1168 * No Miss compl + Packet compl = Packet completed normally. 1169 * Miss compl + Re-inject compl = Packet completed normally. 1170 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. 1171 * Miss compl + Packet compl = Skipped i.e. packet not completed. 1172 */ 1173 static void gve_handle_packet_completion(struct gve_priv *priv, 1174 struct gve_tx_ring *tx, bool is_napi, 1175 u16 compl_tag, u64 *bytes, u64 *pkts, 1176 bool is_reinjection) 1177 { 1178 struct gve_tx_pending_packet_dqo *pending_packet; 1179 1180 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 1181 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 1182 priv->dev->name, (int)compl_tag); 1183 return; 1184 } 1185 1186 pending_packet = &tx->dqo.pending_packets[compl_tag]; 1187 1188 if (unlikely(is_reinjection)) { 1189 if (unlikely(pending_packet->state == 1190 GVE_PACKET_STATE_TIMED_OUT_COMPL)) { 1191 net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", 1192 priv->dev->name, (int)compl_tag); 1193 /* Packet was already completed as a result of timeout, 1194 * so just remove from list and free pending packet. 1195 */ 1196 remove_from_list(tx, 1197 &tx->dqo_compl.timed_out_completions, 1198 pending_packet); 1199 gve_free_pending_packet(tx, pending_packet); 1200 return; 1201 } 1202 if (unlikely(pending_packet->state != 1203 GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { 1204 /* No outstanding miss completion but packet allocated 1205 * implies packet receives a re-injection completion 1206 * without a prior miss completion. Return without 1207 * completing the packet. 1208 */ 1209 net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", 1210 priv->dev->name, (int)compl_tag); 1211 return; 1212 } 1213 remove_from_list(tx, &tx->dqo_compl.miss_completions, 1214 pending_packet); 1215 } else { 1216 /* Packet is allocated but not a pending data completion. */ 1217 if (unlikely(pending_packet->state != 1218 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 1219 net_err_ratelimited("%s: No pending data completion: %d\n", 1220 priv->dev->name, (int)compl_tag); 1221 return; 1222 } 1223 } 1224 tx->dqo_tx.completed_packet_desc_cnt += pending_packet->num_bufs; 1225 1226 switch (pending_packet->type) { 1227 case GVE_TX_PENDING_PACKET_DQO_SKB: 1228 if (tx->dqo.qpl) 1229 gve_free_tx_qpl_bufs(tx, pending_packet); 1230 else 1231 gve_unmap_packet(tx->dev, pending_packet); 1232 (*pkts)++; 1233 *bytes += pending_packet->skb->len; 1234 1235 napi_consume_skb(pending_packet->skb, is_napi); 1236 pending_packet->skb = NULL; 1237 gve_free_pending_packet(tx, pending_packet); 1238 break; 1239 case GVE_TX_PENDING_PACKET_DQO_XDP_FRAME: 1240 gve_unmap_packet(tx->dev, pending_packet); 1241 (*pkts)++; 1242 *bytes += pending_packet->xdpf->len; 1243 1244 xdp_return_frame(pending_packet->xdpf); 1245 pending_packet->xdpf = NULL; 1246 gve_free_pending_packet(tx, pending_packet); 1247 break; 1248 case GVE_TX_PENDING_PACKET_DQO_XSK: 1249 pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE; 1250 break; 1251 default: 1252 WARN_ON_ONCE(1); 1253 } 1254 } 1255 1256 static void gve_handle_miss_completion(struct gve_priv *priv, 1257 struct gve_tx_ring *tx, u16 compl_tag, 1258 u64 *bytes, u64 *pkts) 1259 { 1260 struct gve_tx_pending_packet_dqo *pending_packet; 1261 1262 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 1263 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 1264 priv->dev->name, (int)compl_tag); 1265 return; 1266 } 1267 1268 pending_packet = &tx->dqo.pending_packets[compl_tag]; 1269 if (unlikely(pending_packet->state != 1270 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 1271 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", 1272 priv->dev->name, (int)pending_packet->state, 1273 (int)compl_tag); 1274 return; 1275 } 1276 1277 pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; 1278 /* jiffies can wraparound but time comparisons can handle overflows. */ 1279 pending_packet->timeout_jiffies = 1280 jiffies + 1281 secs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT); 1282 add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); 1283 1284 *bytes += pending_packet->skb->len; 1285 (*pkts)++; 1286 } 1287 1288 static void remove_miss_completions(struct gve_priv *priv, 1289 struct gve_tx_ring *tx) 1290 { 1291 struct gve_tx_pending_packet_dqo *pending_packet; 1292 s16 next_index; 1293 1294 next_index = tx->dqo_compl.miss_completions.head; 1295 while (next_index != -1) { 1296 pending_packet = &tx->dqo.pending_packets[next_index]; 1297 next_index = pending_packet->next; 1298 /* Break early because packets should timeout in order. */ 1299 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 1300 break; 1301 1302 remove_from_list(tx, &tx->dqo_compl.miss_completions, 1303 pending_packet); 1304 /* Unmap/free TX buffers and free skb but do not unallocate packet i.e. 1305 * the completion tag is not freed to ensure that the driver 1306 * can take appropriate action if a corresponding valid 1307 * completion is received later. 1308 */ 1309 if (tx->dqo.qpl) 1310 gve_free_tx_qpl_bufs(tx, pending_packet); 1311 else 1312 gve_unmap_packet(tx->dev, pending_packet); 1313 1314 /* This indicates the packet was dropped. */ 1315 dev_kfree_skb_any(pending_packet->skb); 1316 pending_packet->skb = NULL; 1317 1318 u64_stats_update_begin(&tx->statss); 1319 tx->dropped_pkt++; 1320 u64_stats_update_end(&tx->statss); 1321 1322 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n", 1323 priv->dev->name, 1324 (int)(pending_packet - tx->dqo.pending_packets)); 1325 1326 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; 1327 pending_packet->timeout_jiffies = 1328 jiffies + 1329 secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT); 1330 /* Maintain pending packet in another list so the packet can be 1331 * unallocated at a later time. 1332 */ 1333 add_to_list(tx, &tx->dqo_compl.timed_out_completions, 1334 pending_packet); 1335 } 1336 } 1337 1338 static void remove_timed_out_completions(struct gve_priv *priv, 1339 struct gve_tx_ring *tx) 1340 { 1341 struct gve_tx_pending_packet_dqo *pending_packet; 1342 s16 next_index; 1343 1344 next_index = tx->dqo_compl.timed_out_completions.head; 1345 while (next_index != -1) { 1346 pending_packet = &tx->dqo.pending_packets[next_index]; 1347 next_index = pending_packet->next; 1348 /* Break early because packets should timeout in order. */ 1349 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 1350 break; 1351 1352 remove_from_list(tx, &tx->dqo_compl.timed_out_completions, 1353 pending_packet); 1354 1355 /* Need to count XSK packets in xsk_tx_completed. */ 1356 if (pending_packet->type == GVE_TX_PENDING_PACKET_DQO_XSK) 1357 pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE; 1358 else 1359 gve_free_pending_packet(tx, pending_packet); 1360 } 1361 } 1362 1363 static void gve_tx_process_xsk_completions(struct gve_tx_ring *tx) 1364 { 1365 u32 num_xsks = 0; 1366 1367 while (true) { 1368 struct gve_tx_pending_packet_dqo *pending_packet = 1369 gve_xsk_reorder_queue_head(tx); 1370 1371 if (!pending_packet || 1372 pending_packet->state != GVE_PACKET_STATE_XSK_COMPLETE) 1373 break; 1374 1375 num_xsks++; 1376 gve_xsk_reorder_queue_pop_dqo(tx); 1377 gve_free_pending_packet(tx, pending_packet); 1378 } 1379 1380 if (num_xsks) 1381 xsk_tx_completed(tx->xsk_pool, num_xsks); 1382 } 1383 1384 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 1385 struct napi_struct *napi) 1386 { 1387 u64 reinject_compl_bytes = 0; 1388 u64 reinject_compl_pkts = 0; 1389 int num_descs_cleaned = 0; 1390 u64 miss_compl_bytes = 0; 1391 u64 miss_compl_pkts = 0; 1392 u64 pkt_compl_bytes = 0; 1393 u64 pkt_compl_pkts = 0; 1394 1395 /* Limit in order to avoid blocking for too long */ 1396 while (!napi || pkt_compl_pkts < napi->weight) { 1397 struct gve_tx_compl_desc *compl_desc = 1398 &tx->dqo.compl_ring[tx->dqo_compl.head]; 1399 u16 type; 1400 1401 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) 1402 break; 1403 1404 /* Prefetch the next descriptor. */ 1405 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & 1406 tx->dqo.complq_mask]); 1407 1408 /* Do not read data until we own the descriptor */ 1409 dma_rmb(); 1410 type = compl_desc->type; 1411 1412 if (type == GVE_COMPL_TYPE_DQO_DESC) { 1413 /* This is the last descriptor fetched by HW plus one */ 1414 u16 tx_head = le16_to_cpu(compl_desc->tx_head); 1415 1416 atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); 1417 } else if (type == GVE_COMPL_TYPE_DQO_PKT) { 1418 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1419 if (compl_tag & GVE_ALT_MISS_COMPL_BIT) { 1420 compl_tag &= ~GVE_ALT_MISS_COMPL_BIT; 1421 gve_handle_miss_completion(priv, tx, compl_tag, 1422 &miss_compl_bytes, 1423 &miss_compl_pkts); 1424 } else { 1425 gve_handle_packet_completion(priv, tx, !!napi, 1426 compl_tag, 1427 &pkt_compl_bytes, 1428 &pkt_compl_pkts, 1429 false); 1430 } 1431 } else if (type == GVE_COMPL_TYPE_DQO_MISS) { 1432 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1433 1434 gve_handle_miss_completion(priv, tx, compl_tag, 1435 &miss_compl_bytes, 1436 &miss_compl_pkts); 1437 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { 1438 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 1439 1440 gve_handle_packet_completion(priv, tx, !!napi, 1441 compl_tag, 1442 &reinject_compl_bytes, 1443 &reinject_compl_pkts, 1444 true); 1445 } 1446 1447 tx->dqo_compl.head = 1448 (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; 1449 /* Flip the generation bit when we wrap around */ 1450 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; 1451 num_descs_cleaned++; 1452 } 1453 1454 if (tx->netdev_txq) 1455 netdev_tx_completed_queue(tx->netdev_txq, 1456 pkt_compl_pkts + miss_compl_pkts, 1457 pkt_compl_bytes + miss_compl_bytes); 1458 1459 remove_miss_completions(priv, tx); 1460 remove_timed_out_completions(priv, tx); 1461 1462 if (tx->xsk_pool) 1463 gve_tx_process_xsk_completions(tx); 1464 1465 u64_stats_update_begin(&tx->statss); 1466 tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; 1467 tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; 1468 u64_stats_update_end(&tx->statss); 1469 return num_descs_cleaned; 1470 } 1471 1472 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) 1473 { 1474 struct gve_tx_compl_desc *compl_desc; 1475 struct gve_tx_ring *tx = block->tx; 1476 struct gve_priv *priv = block->priv; 1477 1478 if (do_clean) { 1479 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, 1480 &block->napi); 1481 1482 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ 1483 mb(); 1484 1485 if (netif_tx_queue_stopped(tx->netdev_txq) && 1486 num_descs_cleaned > 0) { 1487 tx->wake_queue++; 1488 netif_tx_wake_queue(tx->netdev_txq); 1489 } 1490 } 1491 1492 /* Return true if we still have work. */ 1493 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; 1494 return compl_desc->generation != tx->dqo_compl.cur_gen_bit; 1495 } 1496 1497 bool gve_xsk_tx_poll_dqo(struct gve_notify_block *rx_block, int budget) 1498 { 1499 struct gve_rx_ring *rx = rx_block->rx; 1500 struct gve_priv *priv = rx->gve; 1501 struct gve_tx_ring *tx; 1502 1503 tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; 1504 if (tx->xsk_pool) 1505 return gve_xsk_tx_dqo(priv, tx, budget); 1506 1507 return 0; 1508 } 1509 1510 bool gve_xdp_poll_dqo(struct gve_notify_block *block) 1511 { 1512 struct gve_tx_compl_desc *compl_desc; 1513 struct gve_tx_ring *tx = block->tx; 1514 struct gve_priv *priv = block->priv; 1515 1516 gve_clean_tx_done_dqo(priv, tx, &block->napi); 1517 1518 /* Return true if we still have work. */ 1519 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; 1520 return compl_desc->generation != tx->dqo_compl.cur_gen_bit; 1521 } 1522 1523 int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 1524 struct xdp_frame *xdpf) 1525 { 1526 struct gve_tx_pending_packet_dqo *pkt; 1527 u32 desc_idx = tx->dqo_tx.tail; 1528 s16 completion_tag; 1529 int num_descs = 1; 1530 dma_addr_t addr; 1531 int err; 1532 1533 if (unlikely(!gve_has_tx_slots_available(tx, num_descs))) 1534 return -EBUSY; 1535 1536 pkt = gve_alloc_pending_packet(tx); 1537 if (unlikely(!pkt)) 1538 return -EBUSY; 1539 1540 pkt->type = GVE_TX_PENDING_PACKET_DQO_XDP_FRAME; 1541 pkt->num_bufs = 0; 1542 pkt->xdpf = xdpf; 1543 completion_tag = pkt - tx->dqo.pending_packets; 1544 1545 /* Generate Packet Descriptor */ 1546 addr = dma_map_single(tx->dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); 1547 err = dma_mapping_error(tx->dev, addr); 1548 if (unlikely(err)) 1549 goto err; 1550 1551 dma_unmap_len_set(pkt, len[pkt->num_bufs], xdpf->len); 1552 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); 1553 pkt->num_bufs++; 1554 1555 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, 1556 false, xdpf->len, 1557 addr, completion_tag, true, 1558 false); 1559 1560 gve_tx_update_tail(tx, desc_idx); 1561 return 0; 1562 1563 err: 1564 pkt->xdpf = NULL; 1565 pkt->num_bufs = 0; 1566 gve_free_pending_packet(tx, pkt); 1567 return err; 1568 } 1569 1570 int gve_xdp_xmit_dqo(struct net_device *dev, int n, struct xdp_frame **frames, 1571 u32 flags) 1572 { 1573 struct gve_priv *priv = netdev_priv(dev); 1574 struct gve_tx_ring *tx; 1575 int i, err = 0, qid; 1576 1577 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 1578 return -EINVAL; 1579 1580 qid = gve_xdp_tx_queue_id(priv, 1581 smp_processor_id() % priv->tx_cfg.num_xdp_queues); 1582 1583 tx = &priv->tx[qid]; 1584 1585 spin_lock(&tx->dqo_tx.xdp_lock); 1586 for (i = 0; i < n; i++) { 1587 err = gve_xdp_xmit_one_dqo(priv, tx, frames[i]); 1588 if (err) 1589 break; 1590 } 1591 1592 if (flags & XDP_XMIT_FLUSH) 1593 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 1594 1595 spin_unlock(&tx->dqo_tx.xdp_lock); 1596 1597 u64_stats_update_begin(&tx->statss); 1598 tx->xdp_xmit += n; 1599 tx->xdp_xmit_errors += n - i; 1600 u64_stats_update_end(&tx->statss); 1601 1602 return i ? i : err; 1603 } 1604