// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

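/* Free the coherent DMA region backing the per-ring header-split buffers,
 * if it was allocated.
 */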
static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

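/* Reset the buffer queue, completion queue, skb context, and buffer state
 * bookkeeping to their initial (empty) state for the given slot counts.
 */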
static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

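/* Zero the descriptor rings and queue resources, release any pages still
 * held by buffer states, and re-initialize the ring state in place.
 */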
static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

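/* Quiesce an RX ring: disable page pool direct recycling, detach NAPI and
 * the notify block, then reset the ring state.
 */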
void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}

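/* Release all resources owned by an RX ring: queue resources, buffer pages
 * or XSK buffers, the QPL or page pool, both descriptor rings, the buffer
 * state array, and the header buffers.
 */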
void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
			xsk_buff_free(bs->xsk_buff);
			bs->xsk_buff = NULL;
		}
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

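/* Allocate one coherent DMA region large enough to hold a header buffer for
 * every slot in the buffer queue.
 */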
static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

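/* Allocate a single RX ring: buffer state array, optional header buffers,
 * completion and buffer descriptor rings, a page pool (raw addressing) or
 * queue page list (QPL mode), and the queue resources block.
 */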
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}

	/* struct gve_xdp_buff is overlaid on struct xdp_buff_xsk and uses the
	 * 24-byte cb field to store gve-specific data.
	 */
	XSK_CHECK_PRIV_TYPE(struct gve_xdp_buff);

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

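/* Ring the buffer queue doorbell so the NIC sees the newly posted tail. */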
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

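/* Post as many RX buffers as both the buffer queue and the completion queue
 * can accept, ringing the doorbell every GVE_RX_BUF_THRESH_DQO slots.
 */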
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

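/* Set skb->ip_summed based on the completion descriptor's checksum bits and
 * the parsed packet type; leave CHECKSUM_NONE on any error indication.
 */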
static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to
 * the skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive
 * or negative, as it is possible that we have read the clock more recently
 * than the hardware has received this packet. To detect this, we use the
 * high bit of the diff, and assume that the read is more recent if the high
 * bit is set. In this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the
 * last clock read is greater than ~2 seconds, this will provide invalid
 * results.
 */
static ktime_t gve_rx_get_hwtstamp(struct gve_priv *gve, u32 hwts)
{
	u64 last_read = READ_ONCE(gve->last_sync_nic_counter);
	u32 low = (u32)last_read;
	s32 diff = hwts - low;

	return ns_to_ktime(last_read + diff);
}

static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
				const struct gve_rx_compl_desc_dqo *desc)
{
	struct sk_buff *skb = rx->ctx.skb_head;

	if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID)
		skb_hwtstamps(skb)->hwtstamp =
			gve_rx_get_hwtstamp(rx->gve, le32_to_cpu(desc->ts));
}

int gve_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
{
	const struct gve_xdp_buff *ctx = (void *)_ctx;

	if (!ctx->gve->nic_ts_report)
		return -ENODATA;

	if (!(ctx->compl_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID))
		return -ENODATA;

	*timestamp = gve_rx_get_hwtstamp(ctx->gve,
					 le32_to_cpu(ctx->compl_desc->ts));
	return 0;
}

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

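/* In QPL mode, fall back to copying into freshly allocated pages once the
 * count of in-use buffer states gets within the on-demand allocation
 * threshold of the total.
 */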
static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
	    (rx->dqo.num_buf_states -
	     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

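/* Copy the received fragment into a newly allocated page, attach that page
 * to the skb tail, and recycle the original buffer state immediately.
 */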
static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}

/* Chains multiple skbs for a single rx packet.
 * Returns 0 if the buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger ondemand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

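/* Convert an XDP buffer to a frame and transmit it on this RX queue's paired
 * XDP TX queue, serialized by the TX ring's xdp_lock.
 */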
static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf)) {
		if (rx->xsk_pool)
			xsk_buff_free(xdp);
		return -ENOSPC;
	}

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act)
{
	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		xsk_buff_free(xdp);
		break;
	case XDP_TX:
		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
			goto err;
		break;
	case XDP_REDIRECT:
		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
			goto err;
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
}

static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
}

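/* Handle a completion whose buffer came from an XSK pool: run the XDP
 * program if one is attached, and on XDP_PASS copy the payload into a new
 * skb since the XSK buffer must be returned to the pool.
 */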
static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
			  const struct gve_rx_compl_desc_dqo *compl_desc,
			  struct gve_rx_buf_state_dqo *buf_state,
			  struct bpf_prog *xprog)
{
	struct xdp_buff *xdp = buf_state->xsk_buff;
	int buf_len = compl_desc->packet_len;
	struct gve_priv *priv = rx->gve;
	struct gve_xdp_buff *gve_xdp;
	int xdp_act;

	xdp->data_end = xdp->data + buf_len;
	xsk_buff_dma_sync_for_cpu(xdp);

	gve_xdp = (void *)xdp;
	gve_xdp->gve = priv;
	gve_xdp->compl_desc = compl_desc;

	if (xprog) {
		xdp_act = bpf_prog_run_xdp(xprog, xdp);
		buf_len = xdp->data_end - xdp->data;
		if (xdp_act != XDP_PASS) {
			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
			gve_free_buf_state(rx, buf_state);
			return 0;
		}
	}

	/* Copy the data to skb */
	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
					    xdp->data, buf_len);
	if (unlikely(!rx->ctx.skb_head)) {
		xsk_buff_free(xdp);
		gve_free_buf_state(rx, buf_state);
		return -ENOMEM;
	}
	rx->ctx.skb_tail = rx->ctx.skb_head;

	/* Free XSK buffer and Buffer state */
	xsk_buff_free(xdp);
	gve_free_buf_state(rx, buf_state);

	/* Update Stats */
	u64_stats_update_begin(&rx->statss);
	rx->xdp_actions[XDP_PASS]++;
	u64_stats_update_end(&rx->statss);
	return 0;
}

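/* Sync the received portion of the buffer for CPU access, via the page pool
 * helper or a plain DMA sync depending on the buffer source.
 */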
static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

	if (rx->dqo.page_pool) {
		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
						  page_info->netmem,
						  page_info->page_offset,
						  buf_len);
	} else {
		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
					      page_info->page_offset +
					      page_info->pad,
					      buf_len, DMA_FROM_DEVICE);
	}
}

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	xprog = READ_ONCE(priv->xdp_prog);
	if (buf_state->xsk_buff)
		return gve_rx_xsk_dqo(napi, rx, compl_desc, buf_state, xprog);

	/* Page might not have been used for a while and was likely last
	 * written by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
		   netmem_is_net_iov(buf_state->page_info.netmem)) {
		/* When header split is disabled, the header goes to the packet
		 * buffer. If the packet buffer is a net_iov, it can't be
		 * easily mapped into kernel space to access the header
		 * required to process the packet.
		 */
		goto error;
	}

	/* Sync the portion of dma buffer for CPU to read. */
	gve_dma_sync(priv, rx, buf_state, buf_len);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	if (xprog) {
		struct gve_xdp_buff gve_xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&gve_xdp.xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&gve_xdp.xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		gve_xdp.gve = priv;
		gve_xdp.compl_desc = compl_desc;

		old_data = gve_xdp.xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &gve_xdp.xdp);
		buf_state->page_info.pad += gve_xdp.xdp.data - old_data;
		buf_len = gve_xdp.xdp.data_end - gve_xdp.xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &gve_xdp.xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak &&
	    !(rx->dqo.page_pool &&
	      netmem_is_net_iov(buf_state->page_info.netmem))) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}

static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, desc);

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}

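/* NAPI poll: drain up to @budget packet completions, recycle completed
 * buffer slots, flush any pending XDP TX/redirect work, and repost RX
 * buffers before returning the amount of work done.
 */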
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}