// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		rx->data.qpl = NULL;

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
}

static void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
				 struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;

	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
	rx->desc.desc_ring = NULL;

	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;

	gve_rx_unfill_pages(priv, rx, cfg);

	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
	rx->data.data_ring = NULL;

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}

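/* Note on buffer recycling (a summary of the scheme used throughout this
 * file): gve_setup_rx_buffer() gives every Rx page a large reference-count
 * bias up front (page_ref_add(page, INT_MAX - 1), pagecnt_bias = INT_MAX),
 * and gve_dec_pagecnt_bias() hands one of those pre-taken references to the
 * stack for each fragment delivered. Once page_count() equals pagecnt_bias
 * again, the stack holds no references and the buffer may be flipped or
 * reposted; see gve_rx_can_recycle_buffer().
 */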
static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	if (!rx->data.raw_addressing) {
		u32 qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);

		rx->data.qpl = &cfg->qpls[qpl_id];
	}

	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	rx->data.qpl = NULL;

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

static int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
				 struct gve_rx_alloc_rings_cfg *cfg,
				 struct gve_rx_ring *rx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = cfg->ring_size;
	int filled_pages;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_copy_pool;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->cnt = 0;
	rx->db_threshold = slots / 2;
	rx->desc.seqno = 1;

	rx->packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
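		/* xdp_lock serializes producers on this XDP Tx ring (XDP_TX
		 * here and, presumably, the ndo_xdp_xmit path as well).
		 */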
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		be64_to_cpu(data_slot->addr) - page_info->page_offset :
		rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;
				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot, rx)) {
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}