// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}
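
/* gve_rx_free_ring_dqo() below releases everything the ring owns: q_resources,
 * the per-buffer states (including any still-attached XSK buffers), the QPL or
 * page pool backing the buffers, both descriptor rings and the header-split
 * buffers. It is presumably only reached once the ring has been quiesced via
 * gve_rx_stop_ring_dqo() above.
 */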

void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
			xsk_buff_free(bs->xsk_buff);
			bs->xsk_buff = NULL;
		}
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}
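
	/* Buffer state accounting differs by mode: with raw addressing each
	 * buffer-queue slot can own its own buffer, so one state per slot is
	 * enough; in QPL mode the states are bounded by the number of pages
	 * the queue page list provides for this ring size.
	 */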
	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
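
	/* Post buffers until either queue runs out of room: the post count is
	 * capped by the completion queue's free slots as well as the buffer
	 * queue's, and the doorbell is only rung every GVE_RX_BUF_THRESH_DQO
	 * buffers rather than once per posted descriptor.
	 */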
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
 * skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive or
 * negative, as it is possible that we have read the clock more recently than
 * the hardware has received this packet. To detect this, we use the high bit of
 * the diff, and assume that the read is more recent if the high bit is set. In
 * this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the last
 * clock read is greater than ~2 seconds, this will provide invalid results.
 */
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx, u32 hwts)
{
	u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
	struct sk_buff *skb = rx->ctx.skb_head;
	u32 low = (u32)last_read;
	s32 diff = hwts - low;

	skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
}
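
/* Illustrative example (made-up values): if last_sync_nic_counter was
 * 0x200000300 ns, then low = 0x00000300. A descriptor timestamp
 * hwts = 0x00000280 yields diff = (s32)(0x00000280 - 0x00000300) = -0x80,
 * so the expanded stamp is 0x200000300 - 0x80 = 0x200000280, i.e. the packet
 * arrived shortly before the last clock read. The signed 32-bit diff is what
 * keeps arrivals within roughly 2^31 ns on either side of the last read
 * (the ~2 s window noted above) unambiguous.
 */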

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
	    (rx->dqo.num_buf_states -
	     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}
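
/* gve_rx_append_frags() below normally hands the receive buffer itself to the
 * skb (gve_skb_add_rx_frag() + gve_reuse_buffer()); when
 * gve_rx_should_trigger_copy_ondemand() reports that the QPL ring is running
 * low on free buffer states, it falls back to gve_rx_copy_ondemand(), which
 * copies the payload into a freshly allocated page so the buffer can be
 * recycled immediately.
 */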

/* Chains multi skbs for single rx packet.
 * Returns 0 if buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger ondemand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf)) {
		if (rx->xsk_pool)
			xsk_buff_free(xdp);
		return -ENOSPC;
	}

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act)
{
	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		xsk_buff_free(xdp);
		break;
	case XDP_TX:
		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
			goto err;
		break;
	case XDP_REDIRECT:
		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
			goto err;
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
}
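
/* gve_xsk_done_dqo() above handles XDP verdicts for zero-copy (XSK) buffers,
 * where a dropped or aborted frame is simply returned with xsk_buff_free().
 * gve_xdp_done_dqo() below is the copy-mode counterpart that operates on
 * driver buffer states: the buffer is recycled on a successful XDP_TX or
 * XDP_REDIRECT and freed back to the ring otherwise.
 */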

static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;
err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
	return;
}

static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
			  struct gve_rx_buf_state_dqo *buf_state, int buf_len,
			  struct bpf_prog *xprog)
{
	struct xdp_buff *xdp = buf_state->xsk_buff;
	struct gve_priv *priv = rx->gve;
	int xdp_act;

	xdp->data_end = xdp->data + buf_len;
	xsk_buff_dma_sync_for_cpu(xdp);

	if (xprog) {
		xdp_act = bpf_prog_run_xdp(xprog, xdp);
		buf_len = xdp->data_end - xdp->data;
		if (xdp_act != XDP_PASS) {
			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
			gve_free_buf_state(rx, buf_state);
			return 0;
		}
	}

	/* Copy the data to skb */
	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
					    xdp->data, buf_len);
	if (unlikely(!rx->ctx.skb_head)) {
		xsk_buff_free(xdp);
		gve_free_buf_state(rx, buf_state);
		return -ENOMEM;
	}
	rx->ctx.skb_tail = rx->ctx.skb_head;

	/* Free XSK buffer and Buffer state */
	xsk_buff_free(xdp);
	gve_free_buf_state(rx, buf_state);

	/* Update Stats */
	u64_stats_update_begin(&rx->statss);
	rx->xdp_actions[XDP_PASS]++;
	u64_stats_update_end(&rx->statss);
	return 0;
}

static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

	if (rx->dqo.page_pool) {
		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
						  page_info->netmem,
						  page_info->page_offset,
						  buf_len);
	} else {
		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
					      page_info->page_offset +
					      page_info->pad,
					      buf_len, DMA_FROM_DEVICE);
	}
}
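
/* gve_dma_sync() above picks the sync primitive that matches the buffer's
 * origin: page_pool-backed netmem uses page_pool_dma_sync_netmem_for_cpu(),
 * while premapped QPL pages use dma_sync_single_range_for_cpu() on the
 * buffer state's DMA address.
 */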

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	xprog = READ_ONCE(priv->xdp_prog);
	if (buf_state->xsk_buff)
		return gve_rx_xsk_dqo(napi, rx, buf_state, buf_len, xprog);

	/* Page might have not been used for awhile and was likely last written
	 * by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
		   netmem_is_net_iov(buf_state->page_info.netmem)) {
		/* when header split is disabled, the header went to the packet
		 * buffer. If the packet buffer is a net_iov, those can't be
		 * easily mapped into the kernel space to access the header
		 * required to process the packet.
		 */
		goto error;
	}

	/* Sync the portion of dma buffer for CPU to read. */
	gve_dma_sync(priv, rx, buf_state, buf_len);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	if (xprog) {
		struct xdp_buff xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		buf_state->page_info.pad += xdp.data - old_data;
		buf_len = xdp.data_end - xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak &&
	    !(rx->dqo.page_pool &&
	      netmem_is_net_iov(buf_state->page_info.netmem))) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}
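
/* Note on the small-packet path in gve_rx_dqo() above: when a complete frame
 * fits within priv->rx_copybreak and is not backed by an unreadable net_iov,
 * the payload is copied into a new skb and the buffer is returned to the free
 * list right away, presumably trading one memcpy for faster buffer turnover.
 */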

static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, le32_to_cpu(desc->ts));

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}

int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];
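
	/* A completion descriptor still belongs to hardware while its
	 * generation bit equals cur_gen_bit; the bit is flipped each time the
	 * completion ring wraps, which is presumably why consumed descriptors
	 * never need to be cleared by software before they are reused.
	 */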
	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}