// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

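/* Unmap an RDA buffer page and drop the extra references that were taken for
 * recycling before handing the page back to the allocator.
 */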
static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

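/* Release the references held on every posted buffer page. In QPL mode the
 * data pages belong to the queue page list, so only the recycling bias is
 * dropped here, while the copy-pool pages are freed outright.
 */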
static void gve_rx_unfill_pages(struct gve_priv *priv,
				struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	u32 slots = rx->mask + 1;
	int i;

	if (!rx->data.page_info)
		return;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static void gve_rx_init_ring_state_gqi(struct gve_rx_ring *rx)
{
	rx->desc.seqno = 1;
	rx->cnt = 0;
	gve_rx_ctx_clear(&rx->ctx);
}

static void gve_rx_reset_ring_gqi(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	const u32 slots = priv->rx_desc_cnt;
	size_t size;

	/* Reset desc ring */
	if (rx->desc.desc_ring) {
		size = slots * sizeof(rx->desc.desc_ring[0]);
		memset(rx->desc.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	gve_rx_init_ring_state_gqi(rx);
}

void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_gqi(priv, idx);
}

void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	int idx = rx->q_num;
	size_t bytes;
	u32 qpl_id;

	if (rx->desc.desc_ring) {
		bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
		dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
		rx->desc.desc_ring = NULL;
	}

	if (rx->q_resources) {
		dma_free_coherent(dev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	gve_rx_unfill_pages(priv, rx, cfg);

	if (rx->data.data_ring) {
		bytes = sizeof(*rx->data.data_ring) * slots;
		dma_free_coherent(dev, bytes, rx->data.data_ring,
				  rx->data.data_bus);
		rx->data.data_ring = NULL;
	}

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	if (rx->data.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, idx);
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

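/* Point a data slot at a freshly mapped page. The page gets a large reference
 * bias (INT_MAX) up front so the hot path can "consume" references by
 * decrementing pagecnt_bias instead of touching the atomic page refcount.
 */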
static void gve_setup_rx_buffer(struct gve_rx_ring *rx,
				struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page,
				__be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	page_info->buf_size = rx->packet_buffer_size;
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot,
			       struct gve_rx_ring *rx)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
		return err;
	}

	gve_setup_rx_buffer(rx, page_info, dma, page, &data_slot->addr);
	return 0;
}

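/* Post a buffer for every slot in the ring. In QPL mode the pre-registered
 * queue-page-list pages are used directly and a small copy pool is set up as
 * a fallback; in raw addressing (RDA) mode a fresh page is allocated and DMA
 * mapped per slot. Returns the number of slots filled or a negative errno.
 */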
static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
				struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvcalloc_node(slots, sizeof(*rx->data.page_info),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->data.page_info)
		return -ENOMEM;

	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(rx, &rx->data.page_info[i], addr,
					    page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_pages_node(priv->numa_node,
							     GFP_KERNEL, 0);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);
			rx->qpl_copy_pool[j].buf_size = rx->packet_buffer_size;

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

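/* Allocate all per-ring resources for one GQI Rx queue: the data slot ring,
 * the QPL copy pool, the queue page list (QPL mode only), the prefilled
 * buffers, the queue resources block and the descriptor ring.
 */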
int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	u32 slots = cfg->ring_size;
	int filled_pages;
	int qpl_page_cnt;
	u32 qpl_id = 0;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	rx->mask = slots - 1;
	rx->data.raw_addressing = cfg->raw_addressing;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc_node(rx->qpl_copy_pool_mask + 1,
					  sizeof(rx->qpl_copy_pool[0]),
					  GFP_KERNEL, priv->numa_node);
	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	if (!rx->data.raw_addressing) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = cfg->ring_size;

		rx->data.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							 qpl_page_cnt);
		if (!rx->data.qpl) {
			err = -ENOMEM;
			goto abort_with_copy_pool;
		}
	}

	filled_pages = gve_rx_prefill_pages(rx, cfg);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_qpl;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * cfg->ring_size;
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->db_threshold = slots / 2;
	gve_rx_init_ring_state_gqi(rx);

	gve_rx_ctx_clear(&rx->ctx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx, cfg);
abort_with_qpl:
	if (!rx->data.raw_addressing) {
		gve_free_queue_page_list(priv, rx->data.qpl, qpl_id);
		rx->data.qpl = NULL;
	}
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err = 0;
	int i, j;

	rx = kvzalloc_objs(struct gve_rx_ring, cfg->qcfg_rx->max_queues);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->rx = rx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

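/* Attach one received fragment to the packet being assembled in the Rx
 * context. A new skb is pulled from napi_get_frags() for the first fragment;
 * once MAX_SKB_FRAGS is reached a second skb is linked via frag_list.
 */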
static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
	*(slot_addr) ^= offset;
}

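/* Compare the page refcount against the recycling bias to decide whether the
 * stack still holds references to this buffer page. Returns 1 if the page can
 * be reused, 0 if it is still in use, and -1 on an inconsistent refcount.
 */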
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

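/* QPL fallback path: copy the received fragment into a page from the copy
 * pool (or a freshly allocated page when the least recently used pool entry
 * is still held by the stack) so the registered QPL buffer can be returned
 * to the device immediately.
 */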
static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       copy_page_info->buf_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, page_info->buf_size,
				       len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    page_info->buf_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

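/* XDP_REDIRECT helper: the original buffer belongs to the driver's recycling
 * scheme, so the frame is copied into an XSK buffer (when an AF_XDP pool is
 * bound) or a page-fragment allocation before calling xdp_do_redirect().
 */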
static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

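/* Handle a non-XDP_PASS verdict: transmit on the paired XDP Tx queue for
 * XDP_TX, redirect for XDP_REDIRECT, and count the action; aborted and
 * dropped frames are simply not passed up, leaving the buffer to be reposted.
 */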
static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
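/* Process a single Rx descriptor: validate the fragment, optionally run the
 * XDP program on single-fragment packets, build (or extend) the skb, and on
 * the last fragment fill in checksum/RSS metadata and hand the packet to GRO.
 */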
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		be64_to_cpu(data_slot->addr) - page_info->page_offset :
		rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, page_info->buf_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

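/* Return true if the descriptor at rx->cnt has been written by the device,
 * i.e. its sequence number matches the one the ring expects next.
 */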
bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

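/* Raw addressing (RDA) refill path: repost buffers until the ring is full,
 * flipping to the unused half of a page when possible and allocating a
 * replacement page when the old one is still referenced by the stack.
 * Returns false if an inconsistent page refcount is detected.
 */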
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;

				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot, rx)) {
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

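/* NAPI cleanup for a GQI Rx ring: consume completed descriptors (exceeding
 * the budget only to finish an in-flight multi-fragment packet), update
 * stats, flush pending XDP TX/redirect work, refill buffers and ring the
 * doorbell. Returns the number of completed packets, or the full budget when
 * the queue must be polled again to finish refilling.
 */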
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}