// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        struct device *hdev = &priv->pdev->dev;
        int buf_count = rx->dqo.bufq.mask + 1;

        if (rx->dqo.hdr_bufs.data) {
                dma_free_coherent(hdev, priv->header_buf_size * buf_count,
                                  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
                rx->dqo.hdr_bufs.data = NULL;
        }
}

static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
                                       const u32 buffer_queue_slots,
                                       const u32 completion_queue_slots)
{
        int i;

        /* Set buffer queue state */
        rx->dqo.bufq.mask = buffer_queue_slots - 1;
        rx->dqo.bufq.head = 0;
        rx->dqo.bufq.tail = 0;

        /* Set completion queue state */
        rx->dqo.complq.num_free_slots = completion_queue_slots;
        rx->dqo.complq.mask = completion_queue_slots - 1;
        rx->dqo.complq.cur_gen_bit = 0;
        rx->dqo.complq.head = 0;

        /* Set RX SKB context */
        rx->ctx.skb_head = NULL;
        rx->ctx.skb_tail = NULL;

        /* Set up linked list of buffer IDs */
        if (rx->dqo.buf_states) {
                for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
                        rx->dqo.buf_states[i].next = i + 1;
                rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
        }

        rx->dqo.free_buf_states = 0;
        rx->dqo.recycled_buf_states.head = -1;
        rx->dqo.recycled_buf_states.tail = -1;
        rx->dqo.used_buf_states.head = -1;
        rx->dqo.used_buf_states.tail = -1;
}

static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        size_t size;
        int i;

        const u32 buffer_queue_slots = priv->rx_desc_cnt;
        const u32 completion_queue_slots = priv->rx_desc_cnt;

        /* Reset buffer queue */
        if (rx->dqo.bufq.desc_ring) {
                size = sizeof(rx->dqo.bufq.desc_ring[0]) *
                        buffer_queue_slots;
                memset(rx->dqo.bufq.desc_ring, 0, size);
        }

        /* Reset completion queue */
        if (rx->dqo.complq.desc_ring) {
                size = sizeof(rx->dqo.complq.desc_ring[0]) *
                        completion_queue_slots;
                memset(rx->dqo.complq.desc_ring, 0, size);
        }

        /* Reset q_resources */
        if (rx->q_resources)
                memset(rx->q_resources, 0, sizeof(*rx->q_resources));

        /* Reset buf states */
        if (rx->dqo.buf_states) {
                for (i = 0; i < rx->dqo.num_buf_states; i++) {
                        struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

                        if (rx->dqo.page_pool)
                                gve_free_to_page_pool(rx, bs, false);
                        else
                                gve_free_qpl_page_dqo(bs);
                }
        }

        gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
                                   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
        int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
        struct gve_rx_ring *rx = &priv->rx[idx];

        if (!gve_rx_was_added_to_block(priv, idx))
                return;

        if (rx->dqo.page_pool)
                page_pool_disable_direct_recycling(rx->dqo.page_pool);
        gve_remove_napi(priv, ntfy_idx);
        gve_rx_remove_from_block(priv, idx);
        gve_rx_reset_ring_dqo(priv, idx);
}

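/* Release everything attached to an RX ring: q_resources, buffer states and
 * their pages, the QPL or page pool, both descriptor rings, and the header
 * buffers. Every branch is NULL-checked, so this is also safe to call on a
 * partially initialized ring from the gve_rx_alloc_ring_dqo() error path.
 */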
void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
                          struct gve_rx_alloc_rings_cfg *cfg)
{
        struct device *hdev = &priv->pdev->dev;
        size_t completion_queue_slots;
        size_t buffer_queue_slots;
        int idx = rx->q_num;
        size_t size;
        u32 qpl_id;
        int i;

        completion_queue_slots = rx->dqo.complq.mask + 1;
        buffer_queue_slots = rx->dqo.bufq.mask + 1;

        if (rx->q_resources) {
                dma_free_coherent(hdev, sizeof(*rx->q_resources),
                                  rx->q_resources, rx->q_resources_bus);
                rx->q_resources = NULL;
        }

        for (i = 0; i < rx->dqo.num_buf_states; i++) {
                struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

                if (rx->dqo.page_pool)
                        gve_free_to_page_pool(rx, bs, false);
                else
                        gve_free_qpl_page_dqo(bs);
                if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
                        xsk_buff_free(bs->xsk_buff);
                        bs->xsk_buff = NULL;
                }
        }

        if (rx->dqo.qpl) {
                qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
                gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
                rx->dqo.qpl = NULL;
        }

        if (rx->dqo.bufq.desc_ring) {
                size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
                dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
                                  rx->dqo.bufq.bus);
                rx->dqo.bufq.desc_ring = NULL;
        }

        if (rx->dqo.complq.desc_ring) {
                size = sizeof(rx->dqo.complq.desc_ring[0]) *
                        completion_queue_slots;
                dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
                                  rx->dqo.complq.bus);
                rx->dqo.complq.desc_ring = NULL;
        }

        kvfree(rx->dqo.buf_states);
        rx->dqo.buf_states = NULL;

        if (rx->dqo.page_pool) {
                page_pool_destroy(rx->dqo.page_pool);
                rx->dqo.page_pool = NULL;
        }

        gve_rx_free_hdr_bufs(priv, rx);

        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
                                 const u32 buf_count)
{
        struct device *hdev = &priv->pdev->dev;

        rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
                                                   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
        if (!rx->dqo.hdr_bufs.data)
                return -ENOMEM;

        return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
        int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

        gve_rx_add_to_block(priv, idx);
        gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

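/* Allocate one RX ring: buffer states, optional header-split buffers, the
 * completion and buffer descriptor rings, a page pool (raw addressing) or a
 * queue page list otherwise, and the q_resources block. On any failure the
 * partially built ring is torn down via gve_rx_free_ring_dqo().
 */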
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
                          struct gve_rx_alloc_rings_cfg *cfg,
                          struct gve_rx_ring *rx,
                          int idx)
{
        struct device *hdev = &priv->pdev->dev;
        struct page_pool *pool;
        int qpl_page_cnt;
        size_t size;
        u32 qpl_id;

        const u32 buffer_queue_slots = cfg->ring_size;
        const u32 completion_queue_slots = cfg->ring_size;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

        memset(rx, 0, sizeof(*rx));
        rx->gve = priv;
        rx->q_num = idx;
        rx->packet_buffer_size = cfg->packet_buffer_size;

        if (cfg->xdp) {
                rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
                rx->rx_headroom = XDP_PACKET_HEADROOM;
        } else {
                rx->packet_buffer_truesize = rx->packet_buffer_size;
                rx->rx_headroom = 0;
        }

        rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
                gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
        rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
                                           sizeof(rx->dqo.buf_states[0]),
                                           GFP_KERNEL, priv->numa_node);
        if (!rx->dqo.buf_states)
                return -ENOMEM;

        /* Allocate header buffers for header-split */
        if (cfg->enable_header_split)
                if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
                        goto err;

        /* Allocate RX completion queue */
        size = sizeof(rx->dqo.complq.desc_ring[0]) *
                completion_queue_slots;
        rx->dqo.complq.desc_ring =
                dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
        if (!rx->dqo.complq.desc_ring)
                goto err;

        /* Allocate RX buffer queue */
        size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
        rx->dqo.bufq.desc_ring =
                dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
        if (!rx->dqo.bufq.desc_ring)
                goto err;

        if (cfg->raw_addressing) {
                pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
                if (IS_ERR(pool))
                        goto err;

                rx->dqo.page_pool = pool;
        } else {
                qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
                qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

                rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
                                                        qpl_page_cnt);
                if (!rx->dqo.qpl)
                        goto err;
                rx->dqo.next_qpl_page_idx = 0;
        }

        rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
                                             &rx->q_resources_bus, GFP_KERNEL);
        if (!rx->q_resources)
                goto err;

        gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
                                   completion_queue_slots);

        return 0;

err:
        gve_rx_free_ring_dqo(priv, rx, cfg);
        return -ENOMEM;
}

void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
        const struct gve_rx_ring *rx = &priv->rx[queue_idx];
        u64 index = be32_to_cpu(rx->q_resources->db_index);

        iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
                           struct gve_rx_alloc_rings_cfg *cfg)
{
        struct gve_rx_ring *rx;
        int err;
        int i;

        rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
                      GFP_KERNEL);
        if (!rx)
                return -ENOMEM;

        for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
                err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        goto err;
                }
        }

        cfg->rx = rx;
        return 0;

err:
        for (i--; i >= 0; i--)
                gve_rx_free_ring_dqo(priv, &rx[i], cfg);
        kvfree(rx);
        return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
                           struct gve_rx_alloc_rings_cfg *cfg)
{
        struct gve_rx_ring *rx = cfg->rx;
        int i;

        if (!rx)
                return;

        for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
                gve_rx_free_ring_dqo(priv, &rx[i], cfg);

        kvfree(rx);
        cfg->rx = NULL;
}

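/* Post fresh buffers to the hardware buffer queue, bounded by both the free
 * buffer-queue slots and the free completion-queue slots. Ring sizes are
 * powers of two, so occupancy falls out of a mask: e.g. with mask = 1023,
 * tail = 10 and head = 1020 (made-up values), (10 - 1020) & 1023 = 14 slots
 * are full. The doorbell is rung once every GVE_RX_BUF_THRESH_DQO buffers
 * rather than per descriptor to limit MMIO writes.
 */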
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
        struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
        struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
        struct gve_priv *priv = rx->gve;
        u32 num_avail_slots;
        u32 num_full_slots;
        u32 num_posted = 0;

        num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
        num_avail_slots = bufq->mask - num_full_slots;

        num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
        while (num_posted < num_avail_slots) {
                struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

                if (unlikely(gve_alloc_buffer(rx, desc))) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_buf_alloc_fail++;
                        u64_stats_update_end(&rx->statss);
                        break;
                }

                if (rx->dqo.hdr_bufs.data)
                        desc->header_buf_addr =
                                cpu_to_le64(rx->dqo.hdr_bufs.addr +
                                            priv->header_buf_size * bufq->tail);

                bufq->tail = (bufq->tail + 1) & bufq->mask;
                complq->num_free_slots--;
                num_posted++;

                if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
                        gve_rx_write_doorbell_dqo(priv, rx->q_num);
        }

        rx->fill_cnt += num_posted;
}

static void gve_rx_skb_csum(struct sk_buff *skb,
                            const struct gve_rx_compl_desc_dqo *desc,
                            struct gve_ptype ptype)
{
        skb->ip_summed = CHECKSUM_NONE;

        /* HW did not identify and process L3 and L4 headers. */
        if (unlikely(!desc->l3_l4_processed))
                return;

        if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
                if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
                        return;
        } else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
                /* Checksum should be skipped if this flag is set. */
                if (unlikely(desc->ipv6_ex_add))
                        return;
        }

        if (unlikely(desc->csum_l4_err))
                return;

        switch (ptype.l4_type) {
        case GVE_L4_TYPE_TCP:
        case GVE_L4_TYPE_UDP:
        case GVE_L4_TYPE_ICMP:
        case GVE_L4_TYPE_SCTP:
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                break;
        default:
                break;
        }
}

static void gve_rx_skb_hash(struct sk_buff *skb,
                            const struct gve_rx_compl_desc_dqo *compl_desc,
                            struct gve_ptype ptype)
{
        enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

        if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
                hash_type = PKT_HASH_TYPE_L4;
        else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
                hash_type = PKT_HASH_TYPE_L3;

        skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
 * skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive or
 * negative, as it is possible that we have read the clock more recently than
 * the hardware has received this packet. To detect this, we use the high bit of
 * the diff, and assume that the read is more recent if the high bit is set. In
 * this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the last
 * clock read is greater than ~2 seconds, this will provide invalid results.
 */
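/* Illustration with made-up values: if the low 32 bits of the last counter
 * read are 0xFFFF0000 and the descriptor timestamp is 0x00000100, the s32
 * diff is +0x10100 ns, so the packet is stamped ~66 us after the last clock
 * read; a negative diff would instead place it before that read.
 */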
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
                                const struct gve_rx_compl_desc_dqo *desc)
{
        u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
        struct sk_buff *skb = rx->ctx.skb_head;
        u32 ts, low;
        s32 diff;

        if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID) {
                ts = le32_to_cpu(desc->ts);
                low = (u32)last_read;
                diff = ts - low;
                skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
        }
}

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
        if (!rx->ctx.skb_head)
                return;

        if (rx->ctx.skb_head == napi->skb)
                napi->skb = NULL;
        dev_kfree_skb_any(rx->ctx.skb_head);
        rx->ctx.skb_head = NULL;
        rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
        if (!rx->dqo.qpl)
                return false;
        if (rx->dqo.used_buf_states_cnt <
            (rx->dqo.num_buf_states -
             GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
                return false;
        return true;
}

static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
                                struct gve_rx_buf_state_dqo *buf_state,
                                u16 buf_len)
{
        struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
        int num_frags;

        if (!page)
                return -ENOMEM;

        memcpy(page_address(page),
               buf_state->page_info.page_address +
               buf_state->page_info.page_offset,
               buf_len);
        num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
        skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
                        0, buf_len, PAGE_SIZE);

        u64_stats_update_begin(&rx->statss);
        rx->rx_frag_alloc_cnt++;
        u64_stats_update_end(&rx->statss);
        /* Return unused buffer. */
        gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
        return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
                                struct gve_rx_buf_state_dqo *buf_state,
                                int num_frags, u16 buf_len)
{
        if (rx->dqo.page_pool) {
                skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
                                       buf_state->page_info.netmem,
                                       buf_state->page_info.page_offset +
                                       buf_state->page_info.pad, buf_len,
                                       buf_state->page_info.buf_size);
        } else {
                skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
                                buf_state->page_info.page,
                                buf_state->page_info.page_offset +
                                buf_state->page_info.pad, buf_len,
                                buf_state->page_info.buf_size);
        }
}

/* Chains multiple skbs for a single rx packet.
 * Returns 0 if buffer is appended, -1 otherwise.
 */
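/* Once the tail skb already holds MAX_SKB_FRAGS fragments, a fresh skb is
 * chained onto skb_head's frag_list (or onto the previous tail) and becomes
 * the new tail before the buffer is attached.
 */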
static int gve_rx_append_frags(struct napi_struct *napi,
                               struct gve_rx_buf_state_dqo *buf_state,
                               u16 buf_len, struct gve_rx_ring *rx,
                               struct gve_priv *priv)
{
        int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

        if (unlikely(num_frags == MAX_SKB_FRAGS)) {
                struct sk_buff *skb;

                skb = napi_alloc_skb(napi, 0);
                if (!skb)
                        return -1;

                if (rx->dqo.page_pool)
                        skb_mark_for_recycle(skb);

                if (rx->ctx.skb_tail == rx->ctx.skb_head)
                        skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
                else
                        rx->ctx.skb_tail->next = skb;
                rx->ctx.skb_tail = skb;
                num_frags = 0;
        }
        if (rx->ctx.skb_tail != rx->ctx.skb_head) {
                rx->ctx.skb_head->len += buf_len;
                rx->ctx.skb_head->data_len += buf_len;
                rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
        }

        /* Trigger ondemand page allocation if we are running low on buffers */
        if (gve_rx_should_trigger_copy_ondemand(rx))
                return gve_rx_copy_ondemand(rx, buf_state, buf_len);

        gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
        gve_reuse_buffer(rx, buf_state);
        return 0;
}

static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
                          struct xdp_buff *xdp)
{
        struct gve_tx_ring *tx;
        struct xdp_frame *xdpf;
        u32 tx_qid;
        int err;

        xdpf = xdp_convert_buff_to_frame(xdp);
        if (unlikely(!xdpf)) {
                if (rx->xsk_pool)
                        xsk_buff_free(xdp);
                return -ENOSPC;
        }

        tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
        tx = &priv->tx[tx_qid];
        spin_lock(&tx->dqo_tx.xdp_lock);
        err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
        spin_unlock(&tx->dqo_tx.xdp_lock);

        return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
                             struct xdp_buff *xdp, struct bpf_prog *xprog,
                             int xdp_act)
{
        switch (xdp_act) {
        case XDP_ABORTED:
        case XDP_DROP:
        default:
                xsk_buff_free(xdp);
                break;
        case XDP_TX:
                if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
                        goto err;
                break;
        case XDP_REDIRECT:
                if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
                        goto err;
                break;
        }

        u64_stats_update_begin(&rx->statss);
        if ((u32)xdp_act < GVE_XDP_ACTIONS)
                rx->xdp_actions[xdp_act]++;
        u64_stats_update_end(&rx->statss);
        return;

err:
        u64_stats_update_begin(&rx->statss);
        if (xdp_act == XDP_TX)
                rx->xdp_tx_errors++;
        if (xdp_act == XDP_REDIRECT)
                rx->xdp_redirect_errors++;
        u64_stats_update_end(&rx->statss);
}

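/* Handle a non-XDP_PASS verdict for a regular (non-XSK) buffer: a drop frees
 * the buffer back to the ring, while a successful XDP_TX or XDP_REDIRECT
 * keeps the underlying page and recycles the buffer state; failures are
 * counted and the buffer is freed.
 */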
static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
                             struct xdp_buff *xdp, struct bpf_prog *xprog,
                             int xdp_act,
                             struct gve_rx_buf_state_dqo *buf_state)
{
        int err;

        switch (xdp_act) {
        case XDP_ABORTED:
        case XDP_DROP:
        default:
                gve_free_buffer(rx, buf_state);
                break;
        case XDP_TX:
                err = gve_xdp_tx_dqo(priv, rx, xdp);
                if (unlikely(err))
                        goto err;
                gve_reuse_buffer(rx, buf_state);
                break;
        case XDP_REDIRECT:
                err = xdp_do_redirect(priv->dev, xdp, xprog);
                if (unlikely(err))
                        goto err;
                gve_reuse_buffer(rx, buf_state);
                break;
        }

        u64_stats_update_begin(&rx->statss);
        if ((u32)xdp_act < GVE_XDP_ACTIONS)
                rx->xdp_actions[xdp_act]++;
        u64_stats_update_end(&rx->statss);
        return;

err:
        u64_stats_update_begin(&rx->statss);
        if (xdp_act == XDP_TX)
                rx->xdp_tx_errors++;
        else if (xdp_act == XDP_REDIRECT)
                rx->xdp_redirect_errors++;
        u64_stats_update_end(&rx->statss);
        gve_free_buffer(rx, buf_state);
}

static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
                          struct gve_rx_buf_state_dqo *buf_state, int buf_len,
                          struct bpf_prog *xprog)
{
        struct xdp_buff *xdp = buf_state->xsk_buff;
        struct gve_priv *priv = rx->gve;
        int xdp_act;

        xdp->data_end = xdp->data + buf_len;
        xsk_buff_dma_sync_for_cpu(xdp);

        if (xprog) {
                xdp_act = bpf_prog_run_xdp(xprog, xdp);
                buf_len = xdp->data_end - xdp->data;
                if (xdp_act != XDP_PASS) {
                        gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
                        gve_free_buf_state(rx, buf_state);
                        return 0;
                }
        }

        /* Copy the data to skb */
        rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
                                            xdp->data, buf_len);
        if (unlikely(!rx->ctx.skb_head)) {
                xsk_buff_free(xdp);
                gve_free_buf_state(rx, buf_state);
                return -ENOMEM;
        }
        rx->ctx.skb_tail = rx->ctx.skb_head;

        /* Free XSK buffer and Buffer state */
        xsk_buff_free(xdp);
        gve_free_buf_state(rx, buf_state);

        /* Update Stats */
        u64_stats_update_begin(&rx->statss);
        rx->xdp_actions[XDP_PASS]++;
        u64_stats_update_end(&rx->statss);
        return 0;
}

static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
                         struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
        struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

        if (rx->dqo.page_pool) {
                page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
                                                  page_info->netmem,
                                                  page_info->page_offset,
                                                  buf_len);
        } else {
                dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
                                              page_info->page_offset +
                                              page_info->pad,
                                              buf_len, DMA_FROM_DEVICE);
        }
}

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
                      const struct gve_rx_compl_desc_dqo *compl_desc,
                      u32 desc_idx, int queue_idx)
{
        const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
        const bool hbo = compl_desc->header_buffer_overflow;
        const bool eop = compl_desc->end_of_packet != 0;
        const bool hsplit = compl_desc->split_header;
        struct gve_rx_buf_state_dqo *buf_state;
        struct gve_priv *priv = rx->gve;
        struct bpf_prog *xprog;
        u16 buf_len;
        u16 hdr_len;

        if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
                net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
                                    priv->dev->name, buffer_id);
                return -EINVAL;
        }
        buf_state = &rx->dqo.buf_states[buffer_id];
        if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
                net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
                                    priv->dev->name, buffer_id);
                return -EINVAL;
        }

        if (unlikely(compl_desc->rx_error)) {
                gve_free_buffer(rx, buf_state);
                return -EINVAL;
        }

        buf_len = compl_desc->packet_len;
        hdr_len = compl_desc->header_len;

        xprog = READ_ONCE(priv->xdp_prog);
        if (buf_state->xsk_buff)
                return gve_rx_xsk_dqo(napi, rx, buf_state, buf_len, xprog);

        /* Page might have not been used for a while and was likely last
         * written by a different thread.
         */
        if (rx->dqo.page_pool) {
                if (!netmem_is_net_iov(buf_state->page_info.netmem))
                        prefetch(netmem_to_page(buf_state->page_info.netmem));
        } else {
                prefetch(buf_state->page_info.page);
        }

        /* Copy the header into the skb in the case of header split */
        if (hsplit) {
                int unsplit = 0;

                if (hdr_len && !hbo) {
                        rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
                                                            rx->dqo.hdr_bufs.data +
                                                            desc_idx * priv->header_buf_size,
                                                            hdr_len);
                        if (unlikely(!rx->ctx.skb_head))
                                goto error;
                        rx->ctx.skb_tail = rx->ctx.skb_head;

                        if (rx->dqo.page_pool)
                                skb_mark_for_recycle(rx->ctx.skb_head);
                } else {
                        unsplit = 1;
                }
                u64_stats_update_begin(&rx->statss);
                rx->rx_hsplit_pkt++;
                rx->rx_hsplit_unsplit_pkt += unsplit;
                rx->rx_hsplit_bytes += hdr_len;
                u64_stats_update_end(&rx->statss);
        } else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
                   netmem_is_net_iov(buf_state->page_info.netmem)) {
                /* when header split is disabled, the header went to the packet
                 * buffer. If the packet buffer is a net_iov, those can't be
                 * easily mapped into the kernel space to access the header
                 * required to process the packet.
                 */
                goto error;
        }

        /* Sync the portion of dma buffer for CPU to read. */
        gve_dma_sync(priv, rx, buf_state, buf_len);

        /* Append to current skb if one exists. */
        if (rx->ctx.skb_head) {
                if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
                                                 priv)) != 0) {
                        goto error;
                }
                return 0;
        }

        if (xprog) {
                struct xdp_buff xdp;
                void *old_data;
                int xdp_act;

                xdp_init_buff(&xdp, buf_state->page_info.buf_size,
                              &rx->xdp_rxq);
                xdp_prepare_buff(&xdp,
                                 buf_state->page_info.page_address +
                                 buf_state->page_info.page_offset,
                                 buf_state->page_info.pad,
                                 buf_len, false);
                old_data = xdp.data;
                xdp_act = bpf_prog_run_xdp(xprog, &xdp);
                buf_state->page_info.pad += xdp.data - old_data;
                buf_len = xdp.data_end - xdp.data;
                if (xdp_act != XDP_PASS) {
                        gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
                                         buf_state);
                        return 0;
                }

                u64_stats_update_begin(&rx->statss);
                rx->xdp_actions[XDP_PASS]++;
                u64_stats_update_end(&rx->statss);
        }

        if (eop && buf_len <= priv->rx_copybreak &&
            !(rx->dqo.page_pool &&
              netmem_is_net_iov(buf_state->page_info.netmem))) {
                rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
                                               &buf_state->page_info, buf_len);
                if (unlikely(!rx->ctx.skb_head))
                        goto error;
                rx->ctx.skb_tail = rx->ctx.skb_head;

                u64_stats_update_begin(&rx->statss);
                rx->rx_copied_pkt++;
                rx->rx_copybreak_pkt++;
                u64_stats_update_end(&rx->statss);

                gve_free_buffer(rx, buf_state);
                return 0;
        }

        rx->ctx.skb_head = napi_get_frags(napi);
        if (unlikely(!rx->ctx.skb_head))
                goto error;
        rx->ctx.skb_tail = rx->ctx.skb_head;

        if (gve_rx_should_trigger_copy_ondemand(rx)) {
                if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
                        goto error;
                return 0;
        }

        if (rx->dqo.page_pool)
                skb_mark_for_recycle(rx->ctx.skb_head);

        gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
        gve_reuse_buffer(rx, buf_state);
        return 0;

error:
        gve_free_buffer(rx, buf_state);
        return -ENOMEM;
}

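/* Populate GSO metadata for a packet the NIC coalesced via RSC so the stack
 * accepts segments larger than the MTU; only TCP over IPv4/IPv6 is handled.
 */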
static int gve_rx_complete_rsc(struct sk_buff *skb,
                               const struct gve_rx_compl_desc_dqo *desc,
                               struct gve_ptype ptype)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        /* Only TCP is supported right now. */
        if (ptype.l4_type != GVE_L4_TYPE_TCP)
                return -EINVAL;

        switch (ptype.l3_type) {
        case GVE_L3_TYPE_IPV4:
                shinfo->gso_type = SKB_GSO_TCPV4;
                break;
        case GVE_L3_TYPE_IPV6:
                shinfo->gso_type = SKB_GSO_TCPV6;
                break;
        default:
                return -EINVAL;
        }

        shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
        return 0;
}

/* Returns 0 if skb is completed successfully, a negative error code otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
                               const struct gve_rx_compl_desc_dqo *desc,
                               netdev_features_t feat)
{
        struct gve_ptype ptype =
                rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
        int err;

        skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

        if (feat & NETIF_F_RXHASH)
                gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

        if (feat & NETIF_F_RXCSUM)
                gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

        if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
                gve_rx_skb_hwtstamp(rx, desc);

        /* RSC packets must set gso_size otherwise the TCP stack will complain
         * that packets are larger than MTU.
         */
        if (desc->rsc) {
                err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
                if (err < 0)
                        return err;
        }

        if (skb_headlen(rx->ctx.skb_head) == 0)
                napi_gro_frags(napi);
        else
                napi_gro_receive(napi, rx->ctx.skb_head);

        return 0;
}

int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
        struct gve_rx_compl_queue_dqo *complq;
        struct napi_struct *napi;
        netdev_features_t feat;
        struct gve_rx_ring *rx;
        struct gve_priv *priv;
        u64 xdp_redirects;
        u32 work_done = 0;
        u64 bytes = 0;
        u64 xdp_txs;
        int err;

        napi = &block->napi;
        feat = napi->dev->features;

        rx = block->rx;
        priv = rx->gve;
        complq = &rx->dqo.complq;

        xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
        xdp_txs = rx->xdp_actions[XDP_TX];

        while (work_done < budget) {
                struct gve_rx_compl_desc_dqo *compl_desc =
                        &complq->desc_ring[complq->head];
                u32 pkt_bytes;

                /* No more new packets */
                if (compl_desc->generation == complq->cur_gen_bit)
                        break;

                /* Prefetch the next two descriptors. */
                prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
                prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

                /* Do not read data until we own the descriptor */
                dma_rmb();

                err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
                if (err < 0) {
                        gve_rx_free_skb(napi, rx);
                        u64_stats_update_begin(&rx->statss);
                        if (err == -ENOMEM)
                                rx->rx_skb_alloc_fail++;
                        else if (err == -EINVAL)
                                rx->rx_desc_err_dropped_pkt++;
                        u64_stats_update_end(&rx->statss);
                }

                complq->head = (complq->head + 1) & complq->mask;
                complq->num_free_slots++;

                /* When the ring wraps, the generation bit is flipped. */
                complq->cur_gen_bit ^= (complq->head == 0);

                /* Receiving a completion means we have space to post another
                 * buffer on the buffer queue.
                 */
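                /* bufq->head is advanced to reflect that; the actual repost
                 * happens in gve_rx_post_buffers_dqo() after the loop.
                 */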
                {
                        struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

                        bufq->head = (bufq->head + 1) & bufq->mask;
                }

                /* Free running counter of completed descriptors */
                rx->cnt++;

                if (!rx->ctx.skb_head)
                        continue;

                if (!compl_desc->end_of_packet)
                        continue;

                work_done++;
                pkt_bytes = rx->ctx.skb_head->len;
                /* The ethernet header (first ETH_HLEN bytes) is snipped off
                 * by eth_type_trans.
                 */
                if (skb_headlen(rx->ctx.skb_head))
                        pkt_bytes += ETH_HLEN;

                /* gve_rx_complete_skb() will consume skb if successful */
                if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
                        gve_rx_free_skb(napi, rx);
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_desc_err_dropped_pkt++;
                        u64_stats_update_end(&rx->statss);
                        continue;
                }

                bytes += pkt_bytes;
                rx->ctx.skb_head = NULL;
                rx->ctx.skb_tail = NULL;
        }

        if (xdp_txs != rx->xdp_actions[XDP_TX])
                gve_xdp_tx_flush_dqo(priv, rx->q_num);

        if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
                xdp_do_flush();

        gve_rx_post_buffers_dqo(rx);

        u64_stats_update_begin(&rx->statss);
        rx->rpackets += work_done;
        rx->rbytes += bytes;
        u64_stats_update_end(&rx->statss);

        return work_done;
}