// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 slots = rx->mask + 1;
	int i;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		gve_unassign_qpl(priv, rx->data.qpl->id);
		rx->data.qpl = NULL;

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	size_t bytes;

	gve_rx_remove_from_block(priv, idx);

	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
	rx->desc.desc_ring = NULL;

	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;

	gve_rx_unfill_pages(priv, rx);

	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
	rx->data.data_ring = NULL;

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err)
		return err;

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}
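
/* Summary of the buffer reference scheme used throughout this file:
 * gve_setup_rx_buffer() front-loads the page refcount by INT_MAX - 1 and
 * remembers that value in pagecnt_bias. Every fragment handed to the stack
 * effectively consumes one unit of the bias (gve_dec_pagecnt_bias()), so
 * page_count() == pagecnt_bias means the stack holds no references and the
 * buffer may be recycled (see gve_rx_can_recycle_buffer()), while
 * page_count() > pagecnt_bias means an SKB still owns part of the page.
 * On teardown the surplus pagecnt_bias - 1 references are dropped again
 * (gve_rx_free_buffer() / gve_rx_unfill_pages()).
 */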

static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	if (!rx->data.raw_addressing) {
		rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
		if (!rx->data.qpl) {
			kvfree(rx->data.page_info);
			rx->data.page_info = NULL;
			return -ENOMEM;
		}
	}
	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
					  &rx->data.data_ring[i]);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	gve_unassign_qpl(priv, rx->data.qpl->id);
	rx->data.qpl = NULL;

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *hdev = &priv->pdev->dev;
	int filled_pages;
	size_t bytes;
	u32 slots;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	slots = priv->rx_data_slot_cnt;
	rx->mask = slots - 1;
	rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	filled_pages = gve_prefill_rx_pages(rx);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_copy_pool;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->cnt = 0;
	rx->db_threshold = priv->rx_desc_cnt / 2;
	rx->desc.seqno = 1;

	/* Allocating half-page buffers allows page-flipping which is faster
	 * than copying or allocating new pages.
	 */
	rx->packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
	gve_rx_ctx_clear(&rx->ctx);
	gve_rx_add_to_block(priv, idx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
		err = gve_rx_alloc_ring(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			break;
		}
	}
	/* Unallocate if there was an error */
	if (err) {
		int j;

		for (j = 0; j < i; j++)
			gve_rx_free_ring(priv, j);
	}
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++)
		gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					u16 packet_buffer_size, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += packet_buffer_size;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, packet_buffer_size);

	return ctx->skb_head;
}
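
/* Note on half-page "flipping": each RX page is used as two
 * packet_buffer_size buffers, one at page_offset 0 and one at
 * GVE_DEFAULT_RX_BUFFER_OFFSET. gve_rx_flip_buff() below toggles between the
 * two halves by XORing the offset into both the driver's page_offset and the
 * (big-endian) address stored in the data slot, so a buffer whose other half
 * is free can be handed back to the device without copying or allocating.
 */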

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       rx->packet_buffer_size,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}
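
/* gve_rx_qpl() below picks between two paths for a QPL buffer: if the other
 * half of the page was seen free, flip to it and return the buffer to the
 * device; otherwise fall back to gve_rx_copy_to_pool() above, which copies
 * the frame into a driver-owned copy page so the registered QPL page can be
 * reposted right away. The copy pool is consumed round-robin: each copy page
 * is itself used in two halves, and a page the stack still references is
 * skipped by advancing qpl_copy_pool_head.
 */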

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}
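
/* gve_xdp_done() below applies the verdict returned by the XDP program:
 * XDP_TX hands the frame to the XDP TX queue paired with this RX queue
 * (under tx->xdp_lock), XDP_REDIRECT copies the frame out of the RX buffer
 * via gve_xdp_redirect() above since RX buffers remain owned by the device,
 * and XDP_ABORTED/XDP_DROP simply fall through so the buffer is recycled by
 * the normal refill path. Failed actions are counted in xdp_tx_errors /
 * xdp_redirect_errors.
 */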

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		be64_to_cpu(data_slot->addr) - page_info->page_offset :
		rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;
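
	/* XDP is only attempted on single-fragment packets: the program sees
	 * the frame in place in the RX buffer (past the GVE_RX_PAD bytes that
	 * precede the first fragment), and on XDP_PASS any headroom
	 * adjustment it made is folded back into page_info->pad and len
	 * before the skb is built below.
	 */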
	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}
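
/* Refill summary for gve_rx_refill_buffers() below (called from
 * gve_clean_rx_done() when running in raw addressing mode): keep posting
 * buffers until the ring is full again. A slot is reposted by flipping to
 * the other half of its page when that half was seen free during descriptor
 * processing; otherwise the page is reused outright once page_count() has
 * fallen back to pagecnt_bias, and only if the stack still holds the page is
 * it replaced with a freshly allocated one (allocation failures are counted
 * and retried on a later poll).
 */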

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one*/
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;
				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot)) {
					u64_stats_update_begin(&rx->statss);
					rx->rx_buf_alloc_fail++;
					u64_stats_update_end(&rx->statss);
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}
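
/* Main GQI RX service routine: consume completed descriptors (running past
 * budget only to finish a multi-fragment packet already in flight), flush
 * any XDP_TX or XDP_REDIRECT work batched while processing, then restock
 * buffers and ring the doorbell so the device can keep receiving.
 */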

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}