// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

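/* Reset the queue pointers and generation bit, clear the skb context, and
 * thread every buffer state into the free list while emptying the recycled
 * and used lists.
 */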
static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

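/* Return the ring to its post-allocation state: zero the descriptor rings and
 * q_resources, release any pages still held by buffer states, and rebuild the
 * buffer-state free list.
 */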
static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}

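/* Release everything the ring owns: q_resources, per-buffer pages or XSK
 * buffers, the QPL or page pool, both descriptor rings, the buffer-state
 * array and any header-split buffers.
 */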
void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
			xsk_buff_free(bs->xsk_buff);
			bs->xsk_buff = NULL;
		}
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

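/* Allocate one DQO RX ring: the buffer-state array, optional header-split
 * buffers, completion and buffer descriptor rings, a page pool (raw
 * addressing) or queue page list (QPL mode), and the queue resources block.
 * On failure everything allocated so far is torn down via
 * gve_rx_free_ring_dqo().
 */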
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

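/* Tell the device how far the buffer queue tail has advanced by writing the
 * tail index to this ring's doorbell in the doorbell BAR.
 */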
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

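/* Post as many RX buffers as both the buffer queue and the completion queue
 * have room for. Each slot gets a freshly allocated buffer and, when header
 * split is enabled, a header buffer address; the doorbell is rung every
 * GVE_RX_BUF_THRESH_DQO buffers.
 */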
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

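/* Set skb->ip_summed from the completion descriptor: CHECKSUM_UNNECESSARY
 * only when the HW parsed the L3/L4 headers, reported no checksum errors and
 * the L4 type is one we recognize; otherwise leave CHECKSUM_NONE.
 */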
static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
 * skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive or
 * negative, as it is possible that we have read the clock more recently than
 * the hardware has received this packet. To detect this, we use the high bit of
 * the diff, and assume that the read is more recent if the high bit is set. In
 * this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the last
 * clock read is greater than ~2 seconds, this will provide invalid results.
 */
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
				const struct gve_rx_compl_desc_dqo *desc)
{
	u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
	struct sk_buff *skb = rx->ctx.skb_head;
	u32 ts, low;
	s32 diff;

	if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID) {
		ts = le32_to_cpu(desc->ts);
		low = (u32)last_read;
		diff = ts - low;
		skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
	}
}

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

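/* In QPL mode, return true once the count of used buffer states is within
 * GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD of the total, signalling that payloads
 * should be copied into freshly allocated pages instead of tying up more QPL
 * buffers.
 */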
static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
	    (rx->dqo.num_buf_states -
	     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

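/* Copy the received fragment into a newly allocated page, attach that page to
 * the skb, and recycle the original buffer state so its buffer can be posted
 * again.
 */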
static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}

/* Chains multiple skbs for a single rx packet.
 * Returns 0 if the buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger on-demand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

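/* Convert the XDP buffer to a frame and transmit it on the XDP TX queue
 * associated with this RX queue, under that queue's xdp_lock.
 */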
static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf)) {
		if (rx->xsk_pool)
			xsk_buff_free(xdp);
		return -ENOSPC;
	}

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act)
{
	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		xsk_buff_free(xdp);
		break;
	case XDP_TX:
		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
			goto err;
		break;
	case XDP_REDIRECT:
		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
			goto err;
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
}

static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
}

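/* Handle a completion whose buffer came from an XSK pool: run the attached
 * XDP program if any, and for XDP_PASS copy the payload into a new skb since
 * the XSK buffer itself is freed back to the pool.
 */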
static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
			  struct gve_rx_buf_state_dqo *buf_state, int buf_len,
			  struct bpf_prog *xprog)
{
	struct xdp_buff *xdp = buf_state->xsk_buff;
	struct gve_priv *priv = rx->gve;
	int xdp_act;

	xdp->data_end = xdp->data + buf_len;
	xsk_buff_dma_sync_for_cpu(xdp);

	if (xprog) {
		xdp_act = bpf_prog_run_xdp(xprog, xdp);
		buf_len = xdp->data_end - xdp->data;
		if (xdp_act != XDP_PASS) {
			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
			gve_free_buf_state(rx, buf_state);
			return 0;
		}
	}

	/* Copy the data to skb */
	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
					    xdp->data, buf_len);
	if (unlikely(!rx->ctx.skb_head)) {
		xsk_buff_free(xdp);
		gve_free_buf_state(rx, buf_state);
		return -ENOMEM;
	}
	rx->ctx.skb_tail = rx->ctx.skb_head;

	/* Free XSK buffer and buffer state */
	xsk_buff_free(xdp);
	gve_free_buf_state(rx, buf_state);

	/* Update stats */
	u64_stats_update_begin(&rx->statss);
	rx->xdp_actions[XDP_PASS]++;
	u64_stats_update_end(&rx->statss);
	return 0;
}

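/* Sync the received portion of the buffer for CPU access, either through the
 * page pool helper or with a plain dma_sync on the QPL mapping.
 */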
static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

	if (rx->dqo.page_pool) {
		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
						  page_info->netmem,
						  page_info->page_offset,
						  buf_len);
	} else {
		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
					      page_info->page_offset +
					      page_info->pad,
					      buf_len, DMA_FROM_DEVICE);
	}
}

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	xprog = READ_ONCE(priv->xdp_prog);
	if (buf_state->xsk_buff)
		return gve_rx_xsk_dqo(napi, rx, buf_state, buf_len, xprog);

	/* Page might not have been used for a while and was likely last
	 * written by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
		   netmem_is_net_iov(buf_state->page_info.netmem)) {
		/* When header split is disabled, the header went to the packet
		 * buffer. If the packet buffer is a net_iov, those can't be
		 * easily mapped into the kernel space to access the header
		 * required to process the packet.
		 */
		goto error;
	}

	/* Sync the portion of dma buffer for CPU to read. */
	gve_dma_sync(priv, rx, buf_state, buf_len);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv) != 0)) {
			goto error;
		}
		return 0;
	}

	if (xprog) {
		struct xdp_buff xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		buf_state->page_info.pad += xdp.data - old_data;
		buf_len = xdp.data_end - xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak &&
	    !(rx->dqo.page_pool &&
	      netmem_is_net_iov(buf_state->page_info.netmem))) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}

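/* Fill in the GSO type and segment size for an RSC (HW-coalesced) packet so
 * the stack can treat it as a GSO super-packet. Only TCP over IPv4/IPv6 is
 * supported.
 */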
static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if the skb is completed successfully, a negative error code
 * otherwise.
 */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, desc);

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}

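/* NAPI poll handler for a DQO RX ring: processes up to @budget end-of-packet
 * completions, flushes any pending XDP TX or redirects, and reposts RX
 * buffers. Returns the number of end-of-packet completions processed.
 */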
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}