// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		gve_unassign_qpl(cfg->qpl_cfg, rx->data.qpl->id);
		rx->data.qpl = NULL;

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
}

static void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
				 struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;

	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
	rx->desc.desc_ring = NULL;

	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;

	gve_rx_unfill_pages(priv, rx, cfg);

	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
	rx->data.data_ring = NULL;

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

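/* Rx buffer pages take a large artificial reference count up front
 * (INT_MAX) and the driver's share is tracked in page_info->pagecnt_bias.
 * Handing a fragment to the stack only decrements the bias, so deciding
 * whether a page can be recycled reduces to comparing page_count() with the
 * bias (see gve_rx_can_recycle_buffer()) instead of taking and dropping page
 * references for every packet.
 */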
static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}

static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	if (!rx->data.raw_addressing) {
		rx->data.qpl = gve_assign_rx_qpl(cfg, rx->q_num);
		if (!rx->data.qpl) {
			kvfree(rx->data.page_info);
			rx->data.page_info = NULL;
			return -ENOMEM;
		}
	}
	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	gve_unassign_qpl(cfg->qpl_cfg, rx->data.qpl->id);
	rx->data.qpl = NULL;

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

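/* rx->ctx tracks the packet currently being assembled from fragments: the
 * head/tail skbs the fragments are attached to, the running byte count, the
 * number of fragments consumed so far, and whether the remainder of the
 * packet should be dropped. It is cleared after the last fragment of each
 * packet (see gve_rx()).
 */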
static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

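/* Allocate one GQI Rx ring: the data (buffer) ring with one slot per packet
 * buffer, the copy pool used in QPL mode, the prefilled buffer pages, the
 * queue resources holding the doorbell index, and the descriptor ring.
 * db_threshold is set to half the ring; on the raw addressing path buffers
 * are only restocked once the available count drops below it.
 */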
static int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
				 struct gve_rx_alloc_rings_cfg *cfg,
				 struct gve_rx_ring *rx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = priv->rx_data_slot_cnt;
	int filled_pages;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_copy_pool;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->cnt = 0;
	rx->db_threshold = slots / 2;
	rx->desc.seqno = 1;

	rx->packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

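/* Each buffer page is split into two GVE_DEFAULT_RX_BUFFER_SIZE halves.
 * "Flipping" points a ring slot at the other half by XORing the buffer
 * offset into both the cached page_offset and the address/offset the device
 * sees, so the page can be reposted while the stack may still be reading the
 * half that was just handed up.
 */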
static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

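/* QPL-mode copy path: when a registered buffer cannot be flipped, the
 * payload is copied into a small ring of driver-owned pages (the "copy
 * pool") so the registered page can go straight back to the device. Copy
 * pool pages are used a half page at a time and recycled with the same
 * pagecnt_bias scheme; if the least recently used pool page is still held by
 * the stack, a fresh page is allocated to avoid head-of-line blocking.
 */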
static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

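/* Build the skb for one fragment: single-fragment packets at or below
 * rx_copybreak are copied into a new skb, everything else is attached as a
 * page fragment, either directly (raw addressing) or through the QPL
 * flip/copy paths above.
 */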
static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

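/* XDP_REDIRECT always operates on a copy of the frame: into an XSK buffer
 * when an AF_XDP pool is bound to this ring, otherwise into a frame from the
 * ring's page_frag cache. The Rx buffer itself stays with the driver so it
 * can be recycled or returned to the device.
 */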
static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

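/* Process a single Rx descriptor. A packet may span several descriptors:
 * every fragment but the last has GVE_RXF_PKT_CONT set, and fragments are
 * accumulated in rx->ctx until the last one is passed up through GRO. XDP is
 * only run on packets that fit in a single buffer.
 */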
#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		   be64_to_cpu(data_slot->addr) - page_info->page_offset :
		   rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

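/* Restock ring slots until fill_cnt is a full ring ahead of cnt (raw
 * addressing path, see gve_clean_rx_done()). For each slot: flip to the free
 * half of the page if descriptor processing marked it flippable, otherwise
 * reuse the whole page when the stack has released it, or free it and
 * allocate a replacement. Returns false if a page's refcount is found below
 * its bias.
 */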
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;

				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot, rx)) {
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

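/* Main Rx cleanup loop, run from NAPI poll: consume descriptors in sequence
 * number order up to the budget, exceeding it only to finish a packet whose
 * fragments are already in flight, then flush pending XDP TX/redirect work,
 * restock buffers as needed, and ring the doorbell with the new fill count.
 */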
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}