// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

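/* Release a raw-addressing (RDA) receive buffer: drop the page references
 * held through the pagecnt bias, then unmap and free the DMA page backing
 * the data slot.
 */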
static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

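/* Undo gve_rx_prefill_pages(): free RDA buffers outright, or drop the bias
 * references on QPL pages and the copy-pool pages, then free the page_info
 * array.
 */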
static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (!rx->data.page_info)
		return;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static void gve_rx_init_ring_state_gqi(struct gve_rx_ring *rx)
{
	rx->desc.seqno = 1;
	rx->cnt = 0;
	gve_rx_ctx_clear(&rx->ctx);
}

static void gve_rx_reset_ring_gqi(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	const u32 slots = priv->rx_desc_cnt;
	size_t size;

	/* Reset desc ring */
	if (rx->desc.desc_ring) {
		size = slots * sizeof(rx->desc.desc_ring[0]);
		memset(rx->desc.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	gve_rx_init_ring_state_gqi(rx);
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_gqi(priv, idx);
}

void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;
	u32 qpl_id;

	if (rx->desc.desc_ring) {
		bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
		dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
		rx->desc.desc_ring = NULL;
	}

	if (rx->q_resources) {
		dma_free_coherent(dev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	gve_rx_unfill_pages(priv, rx, cfg);

	if (rx->data.data_ring) {
		bytes = sizeof(*rx->data.data_ring) * slots;
		dma_free_coherent(dev, bytes, rx->data.data_ring,
				  rx->data.data_bus);
		rx->data.data_ring = NULL;
	}

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	if (rx->data.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, idx);
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_ring *rx,
				struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page,
				__be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	page_info->buf_size = rx->packet_buffer_size;
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(rx, page_info, dma, page, &data_slot->addr);
	return 0;
}

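/* Populate every slot of the data ring with a buffer. QPL rings reuse the
 * registered queue-page-list pages and also fill the copy pool; RDA rings
 * allocate and map a fresh page per slot. Returns the number of slots
 * filled, or a negative errno on failure.
 */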
static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(rx, &rx->data.page_info[i], addr,
					    page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);
			rx->qpl_copy_pool[j].buf_size = rx->packet_buffer_size;

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

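/* Allocate all resources for one GQI receive ring: the data ring, the copy
 * pool, an optional queue page list, prefilled buffers, the queue resources
 * block and the descriptor ring. On failure everything allocated so far is
 * unwound in reverse order.
 */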
int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = cfg->ring_size;
	int filled_pages;
	int qpl_page_cnt;
	u32 qpl_id = 0;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	if (!rx->data.raw_addressing) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = cfg->ring_size;

		rx->data.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							 qpl_page_cnt);
		if (!rx->data.qpl) {
			err = -ENOMEM;
			goto abort_with_copy_pool;
		}
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_qpl;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->db_threshold = slots / 2;
	gve_rx_init_ring_state_gqi(rx);

	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_qpl:
	if (!rx->data.raw_addressing) {
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

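/* Attach one received fragment to the per-context skb chain. The first
 * fragment allocates the napi skb; once MAX_SKB_FRAGS fragments sit in the
 * tail skb, a new skb is linked via frag_list and becomes the new tail.
 */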
static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

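/* Compare the page refcount against the bias: equal means the stack no
 * longer holds the page and it can be recycled (1), greater means it is
 * still in use (0), less should never happen and returns -1.
 */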
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

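/* QPL fallback path: copy the fragment into a page from the copy pool so
 * the registered buffer can be returned to the device immediately. If the
 * least recently used pool page is still held by the stack, a one-off page
 * is allocated instead to avoid head-of-line blocking.
 */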
static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       copy_page_info->buf_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, page_info->buf_size,
				       len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

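/* Build (or extend) the skb for one fragment: copy small single-fragment
 * packets, otherwise hand off to the raw-addressing or QPL path depending
 * on the ring mode.
 */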
static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    page_info->buf_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

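/* Handle XDP_REDIRECT. The packet is copied out of the receive buffer into
 * either an XSK buffer or a freshly allocated page fragment before
 * redirecting, so the original buffer can be recycled regardless of the
 * redirect outcome.
 */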
static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

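/* Complete a non-PASS XDP verdict: transmit on the paired XDP TX queue for
 * XDP_TX, redirect for XDP_REDIRECT, and count the action; aborted and
 * dropped packets are simply counted.
 */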
static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
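/* Process one receive descriptor (one fragment). Drops errored or oversized
 * packets, runs XDP on single-fragment packets, builds the skb, fills in
 * checksum and RSS hash on the first fragment and hands the completed
 * packet to GRO on the last one.
 */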
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va); /* Packet header. */
	prefetch(va + 64); /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		be64_to_cpu(data_slot->addr) - page_info->page_offset :
		rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, page_info->buf_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

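/* Refill data-ring slots up to the ring size (raw addressing): flip to the
 * unused half of a page, reuse the whole page once the stack has released
 * it, or free the page and allocate a replacement.
 */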
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;

				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot, rx)) {
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

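/* NAPI poll work for one ring: consume completed descriptors up to the
 * budget (finishing any in-flight multi-fragment packet), flush XDP TX and
 * redirects, refill buffers and ring the doorbell.
 */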
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}