// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (!rx->data.page_info)
		return;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static void gve_rx_init_ring_state_gqi(struct gve_rx_ring *rx)
{
	rx->desc.seqno = 1;
	rx->cnt = 0;
	gve_rx_ctx_clear(&rx->ctx);
}

static void gve_rx_reset_ring_gqi(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	const u32 slots = priv->rx_desc_cnt;
	size_t size;

	/* Reset desc ring */
	if (rx->desc.desc_ring) {
		size = slots * sizeof(rx->desc.desc_ring[0]);
		memset(rx->desc.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	gve_rx_init_ring_state_gqi(rx);
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_gqi(priv, idx);
}

void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;
	u32 qpl_id;

	if (rx->desc.desc_ring) {
		bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
		dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
		rx->desc.desc_ring = NULL;
	}

	if (rx->q_resources) {
		dma_free_coherent(dev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	gve_rx_unfill_pages(priv, rx, cfg);

	if (rx->data.data_ring) {
		bytes = sizeof(*rx->data.data_ring) * slots;
		dma_free_coherent(dev, bytes, rx->data.data_ring,
				  rx->data.data_bus);
		rx->data.data_ring = NULL;
	}

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	if (rx->data.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, idx);
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page,
				__be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}

static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = cfg->ring_size;
	int filled_pages;
	int qpl_page_cnt;
	u32 qpl_id = 0;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	if (!rx->data.raw_addressing) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = cfg->ring_size;

		rx->data.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							 qpl_page_cnt);
		if (!rx->data.qpl) {
			err = -ENOMEM;
			goto abort_with_copy_pool;
		}
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_qpl;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->db_threshold = slots / 2;
	gve_rx_init_ring_state_gqi(rx);

	rx->packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_qpl:
	if (!rx->data.raw_addressing) {
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
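		/* Note: the XDP TX ring selected here can also be driven by
		 * other transmit paths (e.g. ndo_xdp_xmit), so the
		 * single-frame transmit below is serialized on xdp_lock.
		 */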
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va); /* Packet header. */
	prefetch(va + 64); /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		   be64_to_cpu(data_slot->addr) - page_info->page_offset :
		   rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
				&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
971 */ 972 int recycle = gve_rx_can_recycle_buffer(page_info); 973 974 if (recycle < 0) { 975 if (!rx->data.raw_addressing) 976 gve_schedule_reset(priv); 977 return false; 978 } 979 if (!recycle) { 980 /* We can't reuse the buffer - alloc a new one*/ 981 union gve_rx_data_slot *data_slot = 982 &rx->data.data_ring[idx]; 983 struct device *dev = &priv->pdev->dev; 984 gve_rx_free_buffer(dev, page_info, data_slot); 985 page_info->page = NULL; 986 if (gve_rx_alloc_buffer(priv, dev, page_info, 987 data_slot, rx)) { 988 break; 989 } 990 } 991 } 992 fill_cnt++; 993 } 994 rx->fill_cnt = fill_cnt; 995 return true; 996 } 997 998 static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget, 999 netdev_features_t feat) 1000 { 1001 u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT]; 1002 u64 xdp_txs = rx->xdp_actions[XDP_TX]; 1003 struct gve_rx_ctx *ctx = &rx->ctx; 1004 struct gve_priv *priv = rx->gve; 1005 struct gve_rx_cnts cnts = {0}; 1006 struct gve_rx_desc *next_desc; 1007 u32 idx = rx->cnt & rx->mask; 1008 u32 work_done = 0; 1009 1010 struct gve_rx_desc *desc = &rx->desc.desc_ring[idx]; 1011 1012 // Exceed budget only if (and till) the inflight packet is consumed. 1013 while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) && 1014 (work_done < budget || ctx->frag_cnt)) { 1015 next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask]; 1016 prefetch(next_desc); 1017 1018 gve_rx(rx, feat, desc, idx, &cnts); 1019 1020 rx->cnt++; 1021 idx = rx->cnt & rx->mask; 1022 desc = &rx->desc.desc_ring[idx]; 1023 rx->desc.seqno = gve_next_seqno(rx->desc.seqno); 1024 work_done++; 1025 } 1026 1027 // The device will only send whole packets. 1028 if (unlikely(ctx->frag_cnt)) { 1029 struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; 1030 1031 napi_free_frags(napi); 1032 gve_rx_ctx_clear(&rx->ctx); 1033 netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset", 1034 GVE_SEQNO(desc->flags_seq), rx->desc.seqno); 1035 gve_schedule_reset(rx->gve); 1036 } 1037 1038 if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold) 1039 return 0; 1040 1041 if (work_done) { 1042 u64_stats_update_begin(&rx->statss); 1043 rx->rpackets += cnts.ok_pkt_cnt; 1044 rx->rbytes += cnts.ok_pkt_bytes; 1045 rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt; 1046 rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt; 1047 u64_stats_update_end(&rx->statss); 1048 } 1049 1050 if (xdp_txs != rx->xdp_actions[XDP_TX]) 1051 gve_xdp_tx_flush(priv, rx->q_num); 1052 1053 if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT]) 1054 xdp_do_flush(); 1055 1056 /* restock ring slots */ 1057 if (!rx->data.raw_addressing) { 1058 /* In QPL mode buffs are refilled as the desc are processed */ 1059 rx->fill_cnt += work_done; 1060 } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { 1061 /* In raw addressing mode buffs are only refilled if the avail 1062 * falls below a threshold. 1063 */ 1064 if (!gve_rx_refill_buffers(priv, rx)) 1065 return 0; 1066 1067 /* If we were not able to completely refill buffers, we'll want 1068 * to schedule this queue for work again to refill buffers. 
1069 */ 1070 if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { 1071 gve_rx_write_doorbell(priv, rx); 1072 return budget; 1073 } 1074 } 1075 1076 gve_rx_write_doorbell(priv, rx); 1077 return cnts.total_pkt_cnt; 1078 } 1079 1080 int gve_rx_poll(struct gve_notify_block *block, int budget) 1081 { 1082 struct gve_rx_ring *rx = block->rx; 1083 netdev_features_t feat; 1084 int work_done = 0; 1085 1086 feat = block->napi.dev->features; 1087 1088 if (budget > 0) 1089 work_done = gve_clean_rx_done(rx, budget, feat); 1090 1091 return work_done; 1092 } 1093