// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>

static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      GVE_DATA_SLOT_ADDR_PAGE_MASK);

        page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 slots = rx->mask + 1;
        int i;

        if (rx->data.raw_addressing) {
                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                for (i = 0; i < slots; i++)
                        page_ref_sub(rx->data.page_info[i].page,
                                     rx->data.page_info[i].pagecnt_bias - 1);
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *dev = &priv->pdev->dev;
        u32 slots = rx->mask + 1;
        size_t bytes;

        gve_rx_remove_from_block(priv, idx);

        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
        rx->desc.desc_ring = NULL;

        dma_free_coherent(dev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;

        gve_rx_unfill_pages(priv, rx);

        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(dev, bytes, rx->data.data_ring,
                          rx->data.data_bus);
        rx->data.data_ring = NULL;
        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
                                dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
        page_info->page = page;
        page_info->page_offset = 0;
        page_info->page_address = page_address(page);
        *slot_addr = cpu_to_be64(addr);
        /* The page already has 1 ref */
        page_ref_add(page, INT_MAX - 1);
        page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
        return 0;
}

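/* Post one packet buffer per RX queue slot, either from the assigned QPL or
 * from freshly allocated DMA pages in raw addressing mode.
 * Returns the number of slots filled, or a negative errno on failure.
 */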
static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
        struct gve_priv *priv = rx->gve;
        u32 slots;
        int err;
        int i;

        /* Allocate one page per Rx queue slot. Each page is split into two
         * packet buffers, when possible we "page flip" between the two.
         */
        slots = rx->mask + 1;

        rx->data.page_info = kvzalloc(slots *
                                      sizeof(*rx->data.page_info), GFP_KERNEL);
        if (!rx->data.page_info)
                return -ENOMEM;

        if (!rx->data.raw_addressing) {
                rx->data.qpl = gve_assign_rx_qpl(priv);
                if (!rx->data.qpl) {
                        kvfree(rx->data.page_info);
                        rx->data.page_info = NULL;
                        return -ENOMEM;
                }
        }
        for (i = 0; i < slots; i++) {
                if (!rx->data.raw_addressing) {
                        struct page *page = rx->data.qpl->pages[i];
                        dma_addr_t addr = i * PAGE_SIZE;

                        gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
                                            &rx->data.data_ring[i].qpl_offset);
                        continue;
                }
                err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
                                          &rx->data.data_ring[i]);
                if (err)
                        goto alloc_err;
        }

        return slots;
alloc_err:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;
}

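/* Allocate the DMA-coherent data ring for RX queue @idx, prefill its packet
 * buffers, then allocate the queue resources and the descriptor ring.
 */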
static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots, npages;
        int filled_pages;
        size_t bytes;
        int err;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
        /* Make sure everything is zeroed to start with */
        memset(rx, 0, sizeof(*rx));

        rx->gve = priv;
        rx->q_num = idx;

        slots = priv->rx_data_slot_cnt;
        rx->mask = slots - 1;
        rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

        /* alloc rx data ring */
        bytes = sizeof(*rx->data.data_ring) * slots;
        rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
                                                &rx->data.data_bus,
                                                GFP_KERNEL);
        if (!rx->data.data_ring)
                return -ENOMEM;
        filled_pages = gve_prefill_rx_pages(rx);
        if (filled_pages < 0) {
                err = -ENOMEM;
                goto abort_with_slots;
        }
        rx->fill_cnt = filled_pages;
        /* Ensure data ring slots (packet buffers) are visible. */
        dma_wmb();

        /* Alloc gve_queue_resources */
        rx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*rx->q_resources),
                                   &rx->q_resources_bus,
                                   GFP_KERNEL);
        if (!rx->q_resources) {
                err = -ENOMEM;
                goto abort_filled;
        }
        netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
                  (unsigned long)rx->data.data_bus);

        /* alloc rx desc ring */
        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        npages = bytes / PAGE_SIZE;
        if (npages * PAGE_SIZE != bytes) {
                err = -EIO;
                goto abort_with_q_resources;
        }

        rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
                                                GFP_KERNEL);
        if (!rx->desc.desc_ring) {
                err = -ENOMEM;
                goto abort_with_q_resources;
        }
        rx->cnt = 0;
        rx->db_threshold = priv->rx_desc_cnt / 2;
        rx->desc.seqno = 1;
        gve_rx_add_to_block(priv, idx);

        return 0;

abort_with_q_resources:
        dma_free_coherent(hdev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;
abort_filled:
        gve_rx_unfill_pages(priv, rx);
abort_with_slots:
        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
        rx->data.data_ring = NULL;

        return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++) {
                err = gve_rx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Unallocate if there was an error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_rx_free_ring(priv, j);
        }
        return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++)
                gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

        iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
        if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
                return PKT_HASH_TYPE_L4;
        if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
                return PKT_HASH_TYPE_L3;
        return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
                                        struct gve_rx_slot_page_info *page_info,
                                        u16 len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return NULL;

        skb_add_rx_frag(skb, 0, page_info->page,
                        page_info->page_offset +
                        GVE_RX_PAD, len, PAGE_SIZE / 2);

        return skb;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        /* "flip" to other packet buffer on this page */
        page_info->page_offset ^= PAGE_SIZE / 2;
        *(slot_addr) ^= offset;
}

static bool gve_rx_can_flip_buffers(struct net_device *netdev)
{
        return PAGE_SIZE >= 4096
                ? netdev->mtu + GVE_RX_PAD + ETH_HLEN <= PAGE_SIZE / 2 : false;
}

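/* Decide whether a buffer's page can be handed back to the device:
 * returns 1 if only our bias references remain (page is reusable), 0 if an
 * SKB still holds the page, and -1 if the refcount fell below the bias,
 * which should never happen.
 */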
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
        int pagecount = page_count(page_info->page);

        /* This page is not being used by any SKBs - reuse */
        if (pagecount == page_info->pagecnt_bias)
                return 1;
        /* This page is still being used by an SKB - we can't reuse */
        else if (pagecount > page_info->pagecnt_bias)
                return 0;
        WARN(pagecount < page_info->pagecnt_bias,
             "Pagecount should never be less than the bias.");
        return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
                      struct gve_rx_slot_page_info *page_info, u16 len,
                      struct napi_struct *napi,
                      union gve_rx_data_slot *data_slot)
{
        struct sk_buff *skb;

        skb = gve_rx_add_frags(napi, page_info, len);
        if (!skb)
                return NULL;

        /* Optimistically stop the kernel from freeing the page.
         * We will check again in refill to determine if we need to alloc a
         * new page.
         */
        gve_dec_pagecnt_bias(page_info);

        return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
           struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
           u16 len, struct napi_struct *napi,
           union gve_rx_data_slot *data_slot)
{
        struct sk_buff *skb;

        /* if raw_addressing mode is not enabled gvnic can only receive into
         * registered segments. If the buffer can't be recycled, our only
         * choice is to copy the data out of it so that we can return it to the
         * device.
         */
        if (page_info->can_flip) {
                skb = gve_rx_add_frags(napi, page_info, len);
                /* No point in recycling if we didn't get the skb */
                if (skb) {
                        /* Make sure that the page isn't freed. */
                        gve_dec_pagecnt_bias(page_info);
                        gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
                }
        } else {
                skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD);
                if (skb) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_copied_pkt++;
                        u64_stats_update_end(&rx->statss);
                }
        }
        return skb;
}

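/* Process a single RX descriptor: build an SKB from the slot's packet buffer
 * (copy, flip, or frag-attach) and hand it to NAPI/GRO. Returns true if a
 * packet was passed up the stack, false if it was dropped.
 */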
static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
                   netdev_features_t feat, u32 idx)
{
        struct gve_rx_slot_page_info *page_info;
        struct gve_priv *priv = rx->gve;
        struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
        struct net_device *dev = priv->dev;
        union gve_rx_data_slot *data_slot;
        struct sk_buff *skb = NULL;
        dma_addr_t page_bus;
        void *va;
        u16 len;

        /* Prefetch two packet pages ahead, we will need it soon. */
        page_info = &rx->data.page_info[(idx + 2) & rx->mask];
        va = page_info->page_address + GVE_RX_PAD +
             page_info->page_offset;

        prefetch(page_info->page); /* Kernel page struct. */
        prefetch(va);              /* Packet header. */
        prefetch(va + 64);         /* Next cacheline too. */

        /* drop this packet */
        if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_desc_err_dropped_pkt++;
                u64_stats_update_end(&rx->statss);
                return false;
        }

        len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
        page_info = &rx->data.page_info[idx];

        data_slot = &rx->data.data_ring[idx];
        page_bus = (rx->data.raw_addressing) ?
                   be64_to_cpu(data_slot->addr) & GVE_DATA_SLOT_ADDR_PAGE_MASK :
                   rx->data.qpl->page_buses[idx];
        dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
                                PAGE_SIZE, DMA_FROM_DEVICE);

        if (len <= priv->rx_copybreak) {
                /* Just copy small packets */
                skb = gve_rx_copy(dev, napi, page_info, len, GVE_RX_PAD);
                u64_stats_update_begin(&rx->statss);
                rx->rx_copied_pkt++;
                rx->rx_copybreak_pkt++;
                u64_stats_update_end(&rx->statss);
        } else {
                u8 can_flip = gve_rx_can_flip_buffers(dev);
                int recycle = 0;

                if (can_flip) {
                        recycle = gve_rx_can_recycle_buffer(page_info);
                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                }

                page_info->can_flip = can_flip && recycle;
                if (rx->data.raw_addressing) {
                        skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
                                                    page_info, len, napi,
                                                    data_slot);
                } else {
                        skb = gve_rx_qpl(&priv->pdev->dev, dev, rx,
                                         page_info, len, napi, data_slot);
                }
        }

        if (!skb) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_skb_alloc_fail++;
                u64_stats_update_end(&rx->statss);
                return false;
        }

        if (likely(feat & NETIF_F_RXCSUM)) {
                /* NIC passes up the partial sum */
                if (rx_desc->csum)
                        skb->ip_summed = CHECKSUM_COMPLETE;
                else
                        skb->ip_summed = CHECKSUM_NONE;
                skb->csum = csum_unfold(rx_desc->csum);
        }

        /* parse flags & pass relevant info up */
        if (likely(feat & NETIF_F_RXHASH) &&
            gve_needs_rss(rx_desc->flags_seq))
                skb_set_hash(skb, be32_to_cpu(rx_desc->rss_hash),
                             gve_rss_type(rx_desc->flags_seq));

        if (skb_is_nonlinear(skb))
                napi_gro_frags(napi);
        else
                napi_gro_receive(napi, skb);
        return true;
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
        struct gve_rx_desc *desc;
        __be16 flags_seq;
        u32 next_idx;

        next_idx = rx->cnt & rx->mask;
        desc = rx->desc.desc_ring + next_idx;

        flags_seq = desc->flags_seq;
        /* Make sure we have synchronized the seq no with the device */
        smp_rmb();

        return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

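/* Repost consumed buffers to the data ring: flip to the free half of a page,
 * reuse the page once the stack no longer holds it, or allocate a fresh page.
 * Returns false if a buffer could not be recycled safely (refcount below the
 * bias), true otherwise.
 */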
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        int refill_target = rx->mask + 1;
        u32 fill_cnt = rx->fill_cnt;

        while (fill_cnt - rx->cnt < refill_target) {
                struct gve_rx_slot_page_info *page_info;
                u32 idx = fill_cnt & rx->mask;

                page_info = &rx->data.page_info[idx];
                if (page_info->can_flip) {
                        /* The other half of the page is free because it was
                         * free when we processed the descriptor. Flip to it.
                         */
                        union gve_rx_data_slot *data_slot =
                                &rx->data.data_ring[idx];

                        gve_rx_flip_buff(page_info, &data_slot->addr);
                        page_info->can_flip = 0;
                } else {
                        /* It is possible that the networking stack has already
                         * finished processing all outstanding packets in the buffer
                         * and it can be reused.
                         * Flipping is unnecessary here - if the networking stack still
                         * owns half the page it is impossible to tell which half. Either
                         * the whole page is free or it needs to be replaced.
                         */
                        int recycle = gve_rx_can_recycle_buffer(page_info);

                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                        if (!recycle) {
                                /* We can't reuse the buffer - alloc a new one */
                                union gve_rx_data_slot *data_slot =
                                        &rx->data.data_ring[idx];
                                struct device *dev = &priv->pdev->dev;

                                gve_rx_free_buffer(dev, page_info, data_slot);
                                page_info->page = NULL;
                                if (gve_rx_alloc_buffer(priv, dev, page_info,
                                                        data_slot)) {
                                        u64_stats_update_begin(&rx->statss);
                                        rx->rx_buf_alloc_fail++;
                                        u64_stats_update_end(&rx->statss);
                                        break;
                                }
                        }
                }
                fill_cnt++;
        }
        rx->fill_cnt = fill_cnt;
        return true;
}

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
                             netdev_features_t feat)
{
        struct gve_priv *priv = rx->gve;
        u32 work_done = 0, packets = 0;
        struct gve_rx_desc *desc;
        u32 cnt = rx->cnt;
        u32 idx = cnt & rx->mask;
        u64 bytes = 0;

        desc = rx->desc.desc_ring + idx;
        while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
               work_done < budget) {
                bool dropped;

                netif_info(priv, rx_status, priv->dev,
                           "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
                           rx->q_num, idx, desc, desc->flags_seq);
                netif_info(priv, rx_status, priv->dev,
                           "[%d] seqno=%d rx->desc.seqno=%d\n",
                           rx->q_num, GVE_SEQNO(desc->flags_seq),
                           rx->desc.seqno);

                /* prefetch two descriptors ahead */
                prefetch(rx->desc.desc_ring + ((cnt + 2) & rx->mask));

                dropped = !gve_rx(rx, desc, feat, idx);
                if (!dropped) {
                        bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
                        packets++;
                }
                cnt++;
                idx = cnt & rx->mask;
                desc = rx->desc.desc_ring + idx;
                rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
                work_done++;
        }

        if (!work_done && rx->fill_cnt - cnt > rx->db_threshold)
                return 0;

        if (work_done) {
                u64_stats_update_begin(&rx->statss);
                rx->rpackets += packets;
                rx->rbytes += bytes;
                u64_stats_update_end(&rx->statss);
                rx->cnt = cnt;
        }

        /* restock ring slots */
        if (!rx->data.raw_addressing) {
                /* In QPL mode buffs are refilled as the desc are processed */
                rx->fill_cnt += work_done;
        } else if (rx->fill_cnt - cnt <= rx->db_threshold) {
                /* In raw addressing mode buffs are only refilled if the avail
                 * falls below a threshold.
                 */
                if (!gve_rx_refill_buffers(priv, rx))
                        return 0;

                /* If we were not able to completely refill buffers, we'll want
                 * to schedule this queue for work again to refill buffers.
                 */
                if (rx->fill_cnt - cnt <= rx->db_threshold) {
                        gve_rx_write_doorbell(priv, rx);
                        return budget;
                }
        }

        gve_rx_write_doorbell(priv, rx);
        return work_done;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_rx_ring *rx = block->rx;
        netdev_features_t feat;
        int work_done = 0;

        feat = block->napi.dev->features;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        if (budget > 0)
                work_done = gve_clean_rx_done(rx, budget, feat);

        return work_done;
}