// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (!rx->data.page_info)
		return;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static void gve_rx_init_ring_state_gqi(struct gve_rx_ring *rx)
{
	rx->desc.seqno = 1;
	rx->cnt = 0;
	gve_rx_ctx_clear(&rx->ctx);
}

static void gve_rx_reset_ring_gqi(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	const u32 slots = priv->rx_desc_cnt;
	size_t size;

	/* Reset desc ring */
	if (rx->desc.desc_ring) {
		size = slots * sizeof(rx->desc.desc_ring[0]);
		memset(rx->desc.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	gve_rx_init_ring_state_gqi(rx);
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_gqi(priv, idx);
}

void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;
	u32 qpl_id;

	if (rx->desc.desc_ring) {
		bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
		dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
		rx->desc.desc_ring = NULL;
	}

	if (rx->q_resources) {
		dma_free_coherent(dev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	gve_rx_unfill_pages(priv, rx, cfg);

	if (rx->data.data_ring) {
		bytes = sizeof(*rx->data.data_ring) * slots;
		dma_free_coherent(dev, bytes, rx->data.data_ring,
				  rx->data.data_bus);
		rx->data.data_ring = NULL;
	}

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	if (rx->data.qpl) {
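		/* RX QPL ids are allocated after all of the TX QPL ids, so
		 * the TX queue config is needed here to derive this ring's
		 * QPL id.
		 */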
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, idx);
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_ring *rx,
				struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page,
				__be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	page_info->buf_size = rx->packet_buffer_size;
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(rx, page_info, dma, page, &data_slot->addr);
	return 0;
}

static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(rx, &rx->data.page_info[i], addr,
					    page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);
			rx->qpl_copy_pool[j].buf_size = rx->packet_buffer_size;

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = cfg->ring_size;
	int filled_pages;
	int qpl_page_cnt;
	u32 qpl_id = 0;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	if (!rx->data.raw_addressing) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = cfg->ring_size;

		rx->data.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							 qpl_page_cnt);
		if (!rx->data.qpl) {
			err = -ENOMEM;
			goto abort_with_copy_pool;
		}
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_qpl;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->db_threshold = slots / 2;
	gve_rx_init_ring_state_gqi(rx);

	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_qpl:
	if (!rx->data.raw_addressing) {
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;
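
			/* Out of frag slots: continue the packet in a fresh
			 * zero-length skb linked below via frag_list.
			 */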

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       copy_page_info->buf_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, page_info->buf_size,
				       len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
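			/* One bias ref now belongs to the stack; flipping the
			 * slot lets the device use the other half of the page
			 * for the next descriptor.
			 */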
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    page_info->buf_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
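		/* xdp_lock serializes XDP_TX from NAPI with frames posted to
		 * the same XDP TX ring via ndo_xdp_xmit.
		 */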
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va); /* Packet header. */
	prefetch(va + 64); /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		   be64_to_cpu(data_slot->addr) - page_info->page_offset :
		   rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
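
	/* On the first fragment the packet data is preceded by GVE_RX_PAD
	 * bytes of padding, which the descriptor length includes; strip it
	 * before the buffer is handed to XDP or the stack.
	 */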
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, page_info->buf_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;

				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot, rx)) {
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}