// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}
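
/* Release everything held by a single RX ring: queue resources, buffer
 * states (returning pages to the page pool or QPL as appropriate), the
 * QPL itself, both descriptor rings, and any header-split buffers. Each
 * step is guarded by a NULL check, so this is safe to call on a ring
 * that was only partially allocated.
 */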
void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
			xsk_buff_free(bs->xsk_buff);
			bs->xsk_buff = NULL;
		}
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}
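
/* Allocate one RX ring: the buffer-state array, optional header-split
 * buffers, the completion and buffer descriptor rings, and either a
 * page pool (raw addressing) or a queue page list. Any failure unwinds
 * through gve_rx_free_ring_dqo(), which tolerates partial allocation.
 */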
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}

	/* struct gve_xdp_buff is overlaid on struct xdp_buff_xsk and utilizes
	 * the 24 byte field cb to store gve specific data.
	 */
	XSK_CHECK_PRIV_TYPE(struct gve_xdp_buff);

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}
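
/* Buffer queue occupancy below is computed modulo the ring size. For
 * illustration, with mask = 255 (256 slots), head = 250 and tail = 10:
 * num_full_slots = (10 - 250) & 255 = 16, so at most mask - 16 = 239
 * more buffers are posted. Using mask rather than mask + 1 keeps one
 * slot permanently unused, so head == tail always means "empty" rather
 * than "full".
 */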
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
 * skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive or
 * negative, as it is possible that we have read the clock more recently than
 * the hardware has received this packet. To detect this, we use the high bit of
 * the diff, and assume that the read is more recent if the high bit is set. In
 * this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the last
 * clock read is greater than ~2 seconds, this will provide invalid results.
 */
static ktime_t gve_rx_get_hwtstamp(struct gve_priv *gve, u32 hwts)
{
	u64 last_read = READ_ONCE(gve->last_sync_nic_counter);
	u32 low = (u32)last_read;
	s32 diff = hwts - low;

	return ns_to_ktime(last_read + diff);
}
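
/* For illustration (values chosen arbitrarily): with
 * last_read = 0x5ffffff00, the low word is 0xffffff00. A packet stamped
 * hwts = 0x00000010 gives diff = 0x10 - 0xffffff00 = +0x110 as an s32,
 * so the expanded value is 0x600000010, i.e. just after the 32-bit
 * counter wrapped. Had hwts been slightly below 0xffffff00, diff would
 * be negative and the result slightly earlier than last_read.
 */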
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
				const struct gve_rx_compl_desc_dqo *desc)
{
	struct sk_buff *skb = rx->ctx.skb_head;

	if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID)
		skb_hwtstamps(skb)->hwtstamp =
			gve_rx_get_hwtstamp(rx->gve, le32_to_cpu(desc->ts));
}

int gve_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
{
	const struct gve_xdp_buff *ctx = (void *)_ctx;

	if (!ctx->gve->nic_ts_report)
		return -ENODATA;

	if (!(ctx->compl_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID))
		return -ENODATA;

	*timestamp = gve_rx_get_hwtstamp(ctx->gve,
					 le32_to_cpu(ctx->compl_desc->ts));
	return 0;
}

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
	    (rx->dqo.num_buf_states -
	     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}
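
/* Attach one received buffer to the tail skb as a page fragment. Rings
 * backed by a page pool carry netmem references, while QPL rings use
 * plain pages; both paths account the buffer's full true size.
 */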
static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}

/* Chains multi skbs for single rx packet.
 * Returns 0 if buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger ondemand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf)) {
		if (rx->xsk_pool)
			xsk_buff_free(xdp);
		return -ENOSPC;
	}

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act)
{
	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		xsk_buff_free(xdp);
		break;
	case XDP_TX:
		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
			goto err;
		break;
	case XDP_REDIRECT:
		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
			goto err;
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
}

static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
}
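
/* Handle a completion whose buffer belongs to an XSK pool. If an XDP
 * program is attached and returns anything other than XDP_PASS, the
 * verdict is handled by gve_xsk_done_dqo(); on XDP_PASS the payload is
 * copied into a freshly allocated skb so the zero-copy buffer and its
 * buffer state can be released immediately.
 */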
static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
			  const struct gve_rx_compl_desc_dqo *compl_desc,
			  struct gve_rx_buf_state_dqo *buf_state,
			  struct bpf_prog *xprog)
{
	struct xdp_buff *xdp = buf_state->xsk_buff;
	int buf_len = compl_desc->packet_len;
	struct gve_priv *priv = rx->gve;
	struct gve_xdp_buff *gve_xdp;
	int xdp_act;

	xdp->data_end = xdp->data + buf_len;
	xsk_buff_dma_sync_for_cpu(xdp);

	gve_xdp = (void *)xdp;
	gve_xdp->gve = priv;
	gve_xdp->compl_desc = compl_desc;

	if (xprog) {
		xdp_act = bpf_prog_run_xdp(xprog, xdp);
		buf_len = xdp->data_end - xdp->data;
		if (xdp_act != XDP_PASS) {
			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
			gve_free_buf_state(rx, buf_state);
			return 0;
		}
	}

	/* Copy the data to skb */
	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
					    xdp->data, buf_len);
	if (unlikely(!rx->ctx.skb_head)) {
		xsk_buff_free(xdp);
		gve_free_buf_state(rx, buf_state);
		return -ENOMEM;
	}
	rx->ctx.skb_tail = rx->ctx.skb_head;

	/* Free XSK buffer and Buffer state */
	xsk_buff_free(xdp);
	gve_free_buf_state(rx, buf_state);

	/* Update Stats */
	u64_stats_update_begin(&rx->statss);
	rx->xdp_actions[XDP_PASS]++;
	u64_stats_update_end(&rx->statss);
	return 0;
}
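
/* Make the received bytes visible to the CPU before they are parsed:
 * page-pool buffers are synced through the pool's netmem helper, while
 * QPL pages use a streaming DMA sync over the relevant range.
 */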
static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

	if (rx->dqo.page_pool) {
		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
						  page_info->netmem,
						  page_info->page_offset,
						  buf_len);
	} else {
		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
					      page_info->page_offset +
					      page_info->pad,
					      buf_len, DMA_FROM_DEVICE);
	}
}

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	xprog = READ_ONCE(priv->xdp_prog);
	if (buf_state->xsk_buff)
		return gve_rx_xsk_dqo(napi, rx, compl_desc, buf_state, xprog);

	/* Page might have not been used for awhile and was likely last written
	 * by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
		   netmem_is_net_iov(buf_state->page_info.netmem)) {
		/* when header split is disabled, the header went to the packet
		 * buffer. If the packet buffer is a net_iov, those can't be
		 * easily mapped into the kernel space to access the header
		 * required to process the packet.
		 */
		goto error;
	}

	/* Sync the portion of dma buffer for CPU to read. */
	gve_dma_sync(priv, rx, buf_state, buf_len);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	if (xprog) {
		struct gve_xdp_buff gve_xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&gve_xdp.xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&gve_xdp.xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		gve_xdp.gve = priv;
		gve_xdp.compl_desc = compl_desc;

		old_data = gve_xdp.xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &gve_xdp.xdp);
		buf_state->page_info.pad += gve_xdp.xdp.data - old_data;
		buf_len = gve_xdp.xdp.data_end - gve_xdp.xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &gve_xdp.xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak &&
	    !(rx->dqo.page_pool &&
	      netmem_is_net_iov(buf_state->page_info.netmem))) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}
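
/* Populate GSO metadata for an RSC-coalesced packet so the stack can
 * resegment it; only TCP over IPv4 or IPv6 is accepted here.
 */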
static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if the skb is completed successfully, a negative error code
 * otherwise.
 */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, desc);

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}
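
/* NAPI poll loop for one RX ring. A completion descriptor is new only
 * while its generation bit differs from complq->cur_gen_bit; the driver
 * flips cur_gen_bit each time head wraps back to slot 0, so descriptors
 * left over from the previous pass around the ring read as stale.
 */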
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}