1 // SPDX-License-Identifier: GPL-2.0 2 /* XDP sockets 3 * 4 * AF_XDP sockets allows a channel between XDP programs and userspace 5 * applications. 6 * Copyright(c) 2018 Intel Corporation. 7 * 8 * Author(s): Björn Töpel <bjorn.topel@intel.com> 9 * Magnus Karlsson <magnus.karlsson@intel.com> 10 */ 11 12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ 13 14 #include <linux/if_xdp.h> 15 #include <linux/init.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/sched/task.h> 19 #include <linux/socket.h> 20 #include <linux/file.h> 21 #include <linux/uaccess.h> 22 #include <linux/net.h> 23 #include <linux/netdevice.h> 24 #include <linux/rculist.h> 25 #include <linux/vmalloc.h> 26 27 #include <net/netdev_queues.h> 28 #include <net/xdp_sock_drv.h> 29 #include <net/busy_poll.h> 30 #include <net/netdev_lock.h> 31 #include <net/netdev_rx_queue.h> 32 #include <net/xdp.h> 33 34 #include "../core/dev.h" 35 36 #include "xsk_queue.h" 37 #include "xdp_umem.h" 38 #include "xsk.h" 39 40 #define TX_BATCH_SIZE 32 41 #define MAX_PER_SOCKET_BUDGET 32 42 43 struct xsk_addrs { 44 u32 num_descs; 45 u64 addrs[MAX_SKB_FRAGS + 1]; 46 }; 47 48 static struct kmem_cache *xsk_tx_generic_cache; 49 50 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) 51 { 52 if (pool->cached_need_wakeup & XDP_WAKEUP_RX) 53 return; 54 55 pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; 56 pool->cached_need_wakeup |= XDP_WAKEUP_RX; 57 } 58 EXPORT_SYMBOL(xsk_set_rx_need_wakeup); 59 60 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) 61 { 62 struct xdp_sock *xs; 63 64 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) 65 return; 66 67 rcu_read_lock(); 68 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 69 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 70 } 71 rcu_read_unlock(); 72 73 pool->cached_need_wakeup |= XDP_WAKEUP_TX; 74 } 75 EXPORT_SYMBOL(xsk_set_tx_need_wakeup); 76 77 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) 78 { 79 if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) 80 return; 81 82 pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; 83 pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; 84 } 85 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); 86 87 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) 88 { 89 struct xdp_sock *xs; 90 91 if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) 92 return; 93 94 rcu_read_lock(); 95 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 96 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 97 } 98 rcu_read_unlock(); 99 100 pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; 101 } 102 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); 103 104 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) 105 { 106 return pool->uses_need_wakeup; 107 } 108 EXPORT_SYMBOL(xsk_uses_need_wakeup); 109 110 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 111 u16 queue_id) 112 { 113 if (queue_id < dev->real_num_rx_queues) 114 return dev->_rx[queue_id].pool; 115 if (queue_id < dev->real_num_tx_queues) 116 return dev->_tx[queue_id].pool; 117 118 return NULL; 119 } 120 EXPORT_SYMBOL(xsk_get_pool_from_qid); 121 122 static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 123 { 124 if (queue_id < dev->num_rx_queues) 125 dev->_rx[queue_id].pool = NULL; 126 if (queue_id < dev->num_tx_queues) 127 dev->_tx[queue_id].pool = NULL; 128 } 129 130 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 131 { 132 struct netdev_rx_queue *hw_rxq; 133 134 if (!netif_rxq_is_leased(dev, queue_id)) 135 return __xsk_clear_pool_at_qid(dev, queue_id); 136 WARN_ON_ONCE(!netif_is_queue_leasee(dev)); 137 138 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 139 140 netdev_lock(hw_rxq->dev); 141 queue_id = get_netdev_rx_queue_index(hw_rxq); 142 __xsk_clear_pool_at_qid(hw_rxq->dev, queue_id); 143 netdev_unlock(hw_rxq->dev); 144 } 145 146 static int __xsk_reg_pool_at_qid(struct net_device *dev, 147 struct xsk_buff_pool *pool, u16 queue_id) 148 { 149 if (xsk_get_pool_from_qid(dev, queue_id)) 150 return -EBUSY; 151 152 if (queue_id < dev->real_num_rx_queues) 153 dev->_rx[queue_id].pool = pool; 154 if (queue_id < dev->real_num_tx_queues) 155 dev->_tx[queue_id].pool = pool; 156 157 return 0; 158 } 159 160 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do 161 * not know if the device has more tx queues than rx, or the opposite. 162 * This might also change during run time. 163 */ 164 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 165 u16 queue_id) 166 { 167 struct netdev_rx_queue *hw_rxq; 168 int ret; 169 170 if (queue_id >= max(dev->real_num_rx_queues, 171 dev->real_num_tx_queues)) 172 return -EINVAL; 173 174 if (queue_id >= dev->real_num_rx_queues || 175 !netif_rxq_is_leased(dev, queue_id)) 176 return __xsk_reg_pool_at_qid(dev, pool, queue_id); 177 if (!netif_is_queue_leasee(dev)) 178 return -EBUSY; 179 180 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 181 182 netdev_lock(hw_rxq->dev); 183 queue_id = get_netdev_rx_queue_index(hw_rxq); 184 ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id); 185 netdev_unlock(hw_rxq->dev); 186 187 return ret; 188 } 189 190 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, 191 u32 flags) 192 { 193 u64 addr; 194 int err; 195 196 addr = xp_get_handle(xskb, xskb->pool); 197 err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); 198 if (err) { 199 xs->rx_queue_full++; 200 return err; 201 } 202 203 xp_release(xskb); 204 return 0; 205 } 206 207 static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, 208 u32 len, u32 flags) 209 { 210 u64 addr; 211 212 addr = xp_get_handle(xskb, xskb->pool); 213 __xskq_prod_reserve_desc(xs->rx, addr, len, flags); 214 215 xp_release(xskb); 216 } 217 218 static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 219 { 220 struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); 221 u32 frags = xdp_buff_has_frags(xdp); 222 struct xdp_buff_xsk *pos, *tmp; 223 struct list_head *xskb_list; 224 u32 contd = 0; 225 u32 num_desc; 226 int err; 227 228 if (likely(!frags)) { 229 err = __xsk_rcv_zc(xs, xskb, len, contd); 230 if (err) 231 goto err; 232 return 0; 233 } 234 235 contd = XDP_PKT_CONTD; 236 num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; 237 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 238 xs->rx_queue_full++; 239 err = -ENOBUFS; 240 goto err; 241 } 242 243 __xsk_rcv_zc_safe(xs, xskb, len, contd); 244 xskb_list = &xskb->pool->xskb_list; 245 list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { 246 if (list_is_singular(xskb_list)) 247 contd = 0; 248 len = pos->xdp.data_end - pos->xdp.data; 249 __xsk_rcv_zc_safe(xs, pos, len, contd); 250 list_del_init(&pos->list_node); 251 } 252 253 return 0; 254 err: 255 xsk_buff_free(xdp); 256 return err; 257 } 258 259 static void *xsk_copy_xdp_start(struct xdp_buff *from) 260 { 261 if (unlikely(xdp_data_meta_unsupported(from))) 262 return from->data; 263 else 264 return from->data_meta; 265 } 266 267 static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, 268 u32 *from_len, skb_frag_t **frag, u32 rem) 269 { 270 u32 copied = 0; 271 272 while (1) { 273 u32 copy_len = min_t(u32, *from_len, to_len); 274 275 memcpy(to, *from, copy_len); 276 copied += copy_len; 277 if (rem == copied) 278 return copied; 279 280 if (*from_len == copy_len) { 281 *from = skb_frag_address(*frag); 282 *from_len = skb_frag_size((*frag)++); 283 } else { 284 *from += copy_len; 285 *from_len -= copy_len; 286 } 287 if (to_len == copy_len) 288 return copied; 289 290 to_len -= copy_len; 291 to += copy_len; 292 } 293 } 294 295 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 296 { 297 u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); 298 void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; 299 u32 from_len, meta_len, rem, num_desc; 300 struct xdp_buff_xsk *xskb; 301 struct xdp_buff *xsk_xdp; 302 skb_frag_t *frag; 303 304 from_len = xdp->data_end - copy_from; 305 meta_len = xdp->data - copy_from; 306 rem = len + meta_len; 307 308 if (len <= frame_size && !xdp_buff_has_frags(xdp)) { 309 int err; 310 311 xsk_xdp = xsk_buff_alloc(xs->pool); 312 if (!xsk_xdp) { 313 xs->rx_dropped++; 314 return -ENOMEM; 315 } 316 memcpy(xsk_xdp->data - meta_len, copy_from, rem); 317 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 318 err = __xsk_rcv_zc(xs, xskb, len, 0); 319 if (err) { 320 xsk_buff_free(xsk_xdp); 321 return err; 322 } 323 324 return 0; 325 } 326 327 num_desc = (len - 1) / frame_size + 1; 328 329 if (!xsk_buff_can_alloc(xs->pool, num_desc)) { 330 xs->rx_dropped++; 331 return -ENOMEM; 332 } 333 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 334 xs->rx_queue_full++; 335 return -ENOBUFS; 336 } 337 338 if (xdp_buff_has_frags(xdp)) { 339 struct skb_shared_info *sinfo; 340 341 sinfo = xdp_get_shared_info_from_buff(xdp); 342 frag = &sinfo->frags[0]; 343 } 344 345 do { 346 u32 to_len = frame_size + meta_len; 347 u32 copied; 348 349 xsk_xdp = xsk_buff_alloc(xs->pool); 350 copy_to = xsk_xdp->data - meta_len; 351 352 copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); 353 rem -= copied; 354 355 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 356 __xsk_rcv_zc_safe(xs, xskb, copied - meta_len, 357 rem ? XDP_PKT_CONTD : 0); 358 meta_len = 0; 359 } while (rem); 360 361 return 0; 362 } 363 364 static bool xsk_tx_writeable(struct xdp_sock *xs) 365 { 366 if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) 367 return false; 368 369 return true; 370 } 371 372 static void __xsk_tx_release(struct xdp_sock *xs) 373 { 374 __xskq_cons_release(xs->tx); 375 if (xsk_tx_writeable(xs)) 376 xs->sk.sk_write_space(&xs->sk); 377 } 378 379 static bool xsk_is_bound(struct xdp_sock *xs) 380 { 381 if (READ_ONCE(xs->state) == XSK_BOUND) { 382 /* Matches smp_wmb() in bind(). */ 383 smp_rmb(); 384 return true; 385 } 386 return false; 387 } 388 389 static bool xsk_dev_queue_valid(const struct xdp_sock *xs, 390 const struct xdp_rxq_info *info) 391 { 392 struct net_device *dev = xs->dev; 393 u32 queue_index = xs->queue_id; 394 struct netdev_rx_queue *rxq; 395 396 if (info->dev == dev && 397 info->queue_index == queue_index) 398 return true; 399 400 if (queue_index < dev->real_num_rx_queues) { 401 rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); 402 if (!rxq) 403 return false; 404 405 dev = rxq->dev; 406 queue_index = get_netdev_rx_queue_index(rxq); 407 408 return info->dev == dev && 409 info->queue_index == queue_index; 410 } 411 return false; 412 } 413 414 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 415 { 416 if (!xsk_is_bound(xs)) 417 return -ENXIO; 418 if (!xsk_dev_queue_valid(xs, xdp->rxq)) 419 return -EINVAL; 420 421 if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { 422 xs->rx_dropped++; 423 return -ENOSPC; 424 } 425 426 return 0; 427 } 428 429 static void xsk_flush(struct xdp_sock *xs) 430 { 431 xskq_prod_submit(xs->rx); 432 __xskq_cons_release(xs->pool->fq); 433 sock_def_readable(&xs->sk); 434 } 435 436 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 437 { 438 u32 len = xdp_get_buff_len(xdp); 439 int err; 440 441 err = xsk_rcv_check(xs, xdp, len); 442 if (!err) { 443 spin_lock_bh(&xs->pool->rx_lock); 444 err = __xsk_rcv(xs, xdp, len); 445 xsk_flush(xs); 446 spin_unlock_bh(&xs->pool->rx_lock); 447 } 448 449 return err; 450 } 451 452 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 453 { 454 u32 len = xdp_get_buff_len(xdp); 455 int err; 456 457 err = xsk_rcv_check(xs, xdp, len); 458 if (err) 459 return err; 460 461 if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { 462 len = xdp->data_end - xdp->data; 463 return xsk_rcv_zc(xs, xdp, len); 464 } 465 466 err = __xsk_rcv(xs, xdp, len); 467 if (!err) 468 xdp_return_buff(xdp); 469 return err; 470 } 471 472 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) 473 { 474 int err; 475 476 err = xsk_rcv(xs, xdp); 477 if (err) 478 return err; 479 480 if (!xs->flush_node.prev) { 481 struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); 482 483 list_add(&xs->flush_node, flush_list); 484 } 485 486 return 0; 487 } 488 489 void __xsk_map_flush(struct list_head *flush_list) 490 { 491 struct xdp_sock *xs, *tmp; 492 493 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 494 xsk_flush(xs); 495 __list_del_clearprev(&xs->flush_node); 496 } 497 } 498 499 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) 500 { 501 xskq_prod_submit_n(pool->cq, nb_entries); 502 } 503 EXPORT_SYMBOL(xsk_tx_completed); 504 505 void xsk_tx_release(struct xsk_buff_pool *pool) 506 { 507 struct xdp_sock *xs; 508 509 rcu_read_lock(); 510 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 511 __xsk_tx_release(xs); 512 rcu_read_unlock(); 513 } 514 EXPORT_SYMBOL(xsk_tx_release); 515 516 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) 517 { 518 bool budget_exhausted = false; 519 struct xdp_sock *xs; 520 521 rcu_read_lock(); 522 again: 523 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 524 if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { 525 budget_exhausted = true; 526 continue; 527 } 528 529 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { 530 if (xskq_has_descs(xs->tx)) 531 xskq_cons_release(xs->tx); 532 continue; 533 } 534 535 xs->tx_budget_spent++; 536 537 /* This is the backpressure mechanism for the Tx path. 538 * Reserve space in the completion queue and only proceed 539 * if there is space in it. This avoids having to implement 540 * any buffering in the Tx path. 541 */ 542 if (xskq_prod_reserve_addr(pool->cq, desc->addr)) 543 goto out; 544 545 xskq_cons_release(xs->tx); 546 rcu_read_unlock(); 547 return true; 548 } 549 550 if (budget_exhausted) { 551 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 552 xs->tx_budget_spent = 0; 553 554 budget_exhausted = false; 555 goto again; 556 } 557 558 out: 559 rcu_read_unlock(); 560 return false; 561 } 562 EXPORT_SYMBOL(xsk_tx_peek_desc); 563 564 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) 565 { 566 struct xdp_desc *descs = pool->tx_descs; 567 u32 nb_pkts = 0; 568 569 while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) 570 nb_pkts++; 571 572 xsk_tx_release(pool); 573 return nb_pkts; 574 } 575 576 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) 577 { 578 struct xdp_sock *xs; 579 580 rcu_read_lock(); 581 if (!list_is_singular(&pool->xsk_tx_list)) { 582 /* Fallback to the non-batched version */ 583 rcu_read_unlock(); 584 return xsk_tx_peek_release_fallback(pool, nb_pkts); 585 } 586 587 xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); 588 if (!xs) { 589 nb_pkts = 0; 590 goto out; 591 } 592 593 nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); 594 595 /* This is the backpressure mechanism for the Tx path. Try to 596 * reserve space in the completion queue for all packets, but 597 * if there are fewer slots available, just process that many 598 * packets. This avoids having to implement any buffering in 599 * the Tx path. 600 */ 601 nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); 602 if (!nb_pkts) 603 goto out; 604 605 nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); 606 if (!nb_pkts) { 607 xs->tx->queue_empty_descs++; 608 goto out; 609 } 610 611 __xskq_cons_release(xs->tx); 612 xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); 613 xs->sk.sk_write_space(&xs->sk); 614 615 out: 616 rcu_read_unlock(); 617 return nb_pkts; 618 } 619 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); 620 621 static int xsk_wakeup(struct xdp_sock *xs, u8 flags) 622 { 623 struct net_device *dev = xs->dev; 624 625 return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); 626 } 627 628 static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) 629 { 630 int ret; 631 632 spin_lock(&pool->cq->cq_cached_prod_lock); 633 ret = xskq_prod_reserve(pool->cq); 634 spin_unlock(&pool->cq->cq_cached_prod_lock); 635 636 return ret; 637 } 638 639 static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) 640 { 641 return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; 642 } 643 644 static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) 645 { 646 return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); 647 } 648 649 static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) 650 { 651 skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); 652 } 653 654 static void xsk_inc_num_desc(struct sk_buff *skb) 655 { 656 struct xsk_addrs *xsk_addr; 657 658 if (!xsk_skb_destructor_is_addr(skb)) { 659 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 660 xsk_addr->num_descs++; 661 } 662 } 663 664 static u32 xsk_get_num_desc(struct sk_buff *skb) 665 { 666 struct xsk_addrs *xsk_addr; 667 668 if (xsk_skb_destructor_is_addr(skb)) 669 return 1; 670 671 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 672 673 return xsk_addr->num_descs; 674 } 675 676 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, 677 struct sk_buff *skb) 678 { 679 u32 num_descs = xsk_get_num_desc(skb); 680 struct xsk_addrs *xsk_addr; 681 u32 descs_processed = 0; 682 unsigned long flags; 683 u32 idx, i; 684 685 spin_lock_irqsave(&pool->cq_prod_lock, flags); 686 idx = xskq_get_prod(pool->cq); 687 688 if (unlikely(num_descs > 1)) { 689 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 690 691 for (i = 0; i < num_descs; i++) { 692 xskq_prod_write_addr(pool->cq, idx + descs_processed, 693 xsk_addr->addrs[i]); 694 descs_processed++; 695 } 696 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 697 } else { 698 xskq_prod_write_addr(pool->cq, idx, 699 xsk_skb_destructor_get_addr(skb)); 700 descs_processed++; 701 } 702 xskq_prod_submit_n(pool->cq, descs_processed); 703 spin_unlock_irqrestore(&pool->cq_prod_lock, flags); 704 } 705 706 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) 707 { 708 spin_lock(&pool->cq->cq_cached_prod_lock); 709 xskq_prod_cancel_n(pool->cq, n); 710 spin_unlock(&pool->cq->cq_cached_prod_lock); 711 } 712 713 INDIRECT_CALLABLE_SCOPE 714 void xsk_destruct_skb(struct sk_buff *skb) 715 { 716 struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; 717 718 if (compl->tx_timestamp) { 719 /* sw completion timestamp, not a real one */ 720 *compl->tx_timestamp = ktime_get_tai_fast_ns(); 721 } 722 723 xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); 724 sock_wfree(skb); 725 } 726 727 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, 728 u64 addr) 729 { 730 skb->dev = xs->dev; 731 skb->priority = READ_ONCE(xs->sk.sk_priority); 732 skb->mark = READ_ONCE(xs->sk.sk_mark); 733 skb->destructor = xsk_destruct_skb; 734 xsk_skb_destructor_set_addr(skb, addr); 735 } 736 737 static void xsk_consume_skb(struct sk_buff *skb) 738 { 739 struct xdp_sock *xs = xdp_sk(skb->sk); 740 u32 num_descs = xsk_get_num_desc(skb); 741 struct xsk_addrs *xsk_addr; 742 743 if (unlikely(num_descs > 1)) { 744 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 745 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 746 } 747 748 skb->destructor = sock_wfree; 749 xsk_cq_cancel_locked(xs->pool, num_descs); 750 /* Free skb without triggering the perf drop trace */ 751 consume_skb(skb); 752 xs->skb = NULL; 753 } 754 755 static void xsk_drop_skb(struct sk_buff *skb) 756 { 757 xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); 758 xsk_consume_skb(skb); 759 } 760 761 static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, 762 struct xdp_desc *desc, struct xsk_buff_pool *pool, 763 u32 hr) 764 { 765 struct xsk_tx_metadata *meta = NULL; 766 767 if (unlikely(pool->tx_metadata_len == 0)) 768 return -EINVAL; 769 770 meta = buffer - pool->tx_metadata_len; 771 if (unlikely(!xsk_buff_valid_tx_metadata(meta))) 772 return -EINVAL; 773 774 if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { 775 if (unlikely(meta->request.csum_start + 776 meta->request.csum_offset + 777 sizeof(__sum16) > desc->len)) 778 return -EINVAL; 779 780 skb->csum_start = hr + meta->request.csum_start; 781 skb->csum_offset = meta->request.csum_offset; 782 skb->ip_summed = CHECKSUM_PARTIAL; 783 784 if (unlikely(pool->tx_sw_csum)) { 785 int err; 786 787 err = skb_checksum_help(skb); 788 if (err) 789 return err; 790 } 791 } 792 793 if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) 794 skb->skb_mstamp_ns = meta->request.launch_time; 795 xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); 796 797 return 0; 798 } 799 800 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, 801 struct xdp_desc *desc) 802 { 803 struct xsk_buff_pool *pool = xs->pool; 804 u32 hr, len, ts, offset, copy, copied; 805 struct sk_buff *skb = xs->skb; 806 struct page *page; 807 void *buffer; 808 int err, i; 809 u64 addr; 810 811 addr = desc->addr; 812 buffer = xsk_buff_raw_get_data(pool, addr); 813 814 if (!skb) { 815 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); 816 817 skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); 818 if (unlikely(!skb)) 819 return ERR_PTR(err); 820 821 skb_reserve(skb, hr); 822 823 xsk_skb_init_misc(skb, xs, desc->addr); 824 if (desc->options & XDP_TX_METADATA) { 825 err = xsk_skb_metadata(skb, buffer, desc, pool, hr); 826 if (unlikely(err)) 827 return ERR_PTR(err); 828 } 829 } else { 830 struct xsk_addrs *xsk_addr; 831 832 if (xsk_skb_destructor_is_addr(skb)) { 833 xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, 834 GFP_KERNEL); 835 if (!xsk_addr) 836 return ERR_PTR(-ENOMEM); 837 838 xsk_addr->num_descs = 1; 839 xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); 840 skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 841 } else { 842 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 843 } 844 845 /* in case of -EOVERFLOW that could happen below, 846 * xsk_consume_skb() will release this node as whole skb 847 * would be dropped, which implies freeing all list elements 848 */ 849 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 850 } 851 852 len = desc->len; 853 ts = pool->unaligned ? len : pool->chunk_size; 854 855 offset = offset_in_page(buffer); 856 addr = buffer - pool->addrs; 857 858 for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { 859 if (unlikely(i >= MAX_SKB_FRAGS)) 860 return ERR_PTR(-EOVERFLOW); 861 862 page = pool->umem->pgs[addr >> PAGE_SHIFT]; 863 get_page(page); 864 865 copy = min_t(u32, PAGE_SIZE - offset, len - copied); 866 skb_fill_page_desc(skb, i, page, offset, copy); 867 868 copied += copy; 869 addr += copy; 870 offset = 0; 871 } 872 873 skb->len += len; 874 skb->data_len += len; 875 skb->truesize += ts; 876 877 refcount_add(ts, &xs->sk.sk_wmem_alloc); 878 879 return skb; 880 } 881 882 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, 883 struct xdp_desc *desc) 884 { 885 struct net_device *dev = xs->dev; 886 struct sk_buff *skb = xs->skb; 887 int err; 888 889 if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { 890 skb = xsk_build_skb_zerocopy(xs, desc); 891 if (IS_ERR(skb)) { 892 err = PTR_ERR(skb); 893 skb = NULL; 894 goto free_err; 895 } 896 } else { 897 u32 hr, tr, len; 898 void *buffer; 899 900 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); 901 len = desc->len; 902 903 if (!skb) { 904 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); 905 tr = dev->needed_tailroom; 906 skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); 907 if (unlikely(!skb)) 908 goto free_err; 909 910 skb_reserve(skb, hr); 911 skb_put(skb, len); 912 913 err = skb_store_bits(skb, 0, buffer, len); 914 if (unlikely(err)) 915 goto free_err; 916 917 xsk_skb_init_misc(skb, xs, desc->addr); 918 if (desc->options & XDP_TX_METADATA) { 919 err = xsk_skb_metadata(skb, buffer, desc, 920 xs->pool, hr); 921 if (unlikely(err)) 922 goto free_err; 923 } 924 } else { 925 int nr_frags = skb_shinfo(skb)->nr_frags; 926 struct xsk_addrs *xsk_addr; 927 struct page *page; 928 u8 *vaddr; 929 930 if (xsk_skb_destructor_is_addr(skb)) { 931 xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, 932 GFP_KERNEL); 933 if (!xsk_addr) { 934 err = -ENOMEM; 935 goto free_err; 936 } 937 938 xsk_addr->num_descs = 1; 939 xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); 940 skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 941 } else { 942 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 943 } 944 945 if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { 946 err = -EOVERFLOW; 947 goto free_err; 948 } 949 950 page = alloc_page(xs->sk.sk_allocation); 951 if (unlikely(!page)) { 952 err = -EAGAIN; 953 goto free_err; 954 } 955 956 vaddr = kmap_local_page(page); 957 memcpy(vaddr, buffer, len); 958 kunmap_local(vaddr); 959 960 skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); 961 refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); 962 963 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 964 } 965 } 966 967 xsk_inc_num_desc(skb); 968 969 return skb; 970 971 free_err: 972 if (skb && !skb_shinfo(skb)->nr_frags) 973 kfree_skb(skb); 974 975 if (err == -EOVERFLOW) { 976 /* Drop the packet */ 977 xsk_inc_num_desc(xs->skb); 978 xsk_drop_skb(xs->skb); 979 xskq_cons_release(xs->tx); 980 } else { 981 /* Let application retry */ 982 xsk_cq_cancel_locked(xs->pool, 1); 983 } 984 985 return ERR_PTR(err); 986 } 987 988 static int __xsk_generic_xmit(struct sock *sk) 989 { 990 struct xdp_sock *xs = xdp_sk(sk); 991 bool sent_frame = false; 992 struct xdp_desc desc; 993 struct sk_buff *skb; 994 u32 max_batch; 995 int err = 0; 996 997 mutex_lock(&xs->mutex); 998 999 /* Since we dropped the RCU read lock, the socket state might have changed. */ 1000 if (unlikely(!xsk_is_bound(xs))) { 1001 err = -ENXIO; 1002 goto out; 1003 } 1004 1005 if (xs->queue_id >= xs->dev->real_num_tx_queues) 1006 goto out; 1007 1008 max_batch = READ_ONCE(xs->max_tx_budget); 1009 while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { 1010 if (max_batch-- == 0) { 1011 err = -EAGAIN; 1012 goto out; 1013 } 1014 1015 /* This is the backpressure mechanism for the Tx path. 1016 * Reserve space in the completion queue and only proceed 1017 * if there is space in it. This avoids having to implement 1018 * any buffering in the Tx path. 1019 */ 1020 err = xsk_cq_reserve_locked(xs->pool); 1021 if (err) { 1022 err = -EAGAIN; 1023 goto out; 1024 } 1025 1026 skb = xsk_build_skb(xs, &desc); 1027 if (IS_ERR(skb)) { 1028 err = PTR_ERR(skb); 1029 if (err != -EOVERFLOW) 1030 goto out; 1031 err = 0; 1032 continue; 1033 } 1034 1035 xskq_cons_release(xs->tx); 1036 1037 if (xp_mb_desc(&desc)) { 1038 xs->skb = skb; 1039 continue; 1040 } 1041 1042 err = __dev_direct_xmit(skb, xs->queue_id); 1043 if (err == NETDEV_TX_BUSY) { 1044 /* Tell user-space to retry the send */ 1045 xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); 1046 xsk_consume_skb(skb); 1047 err = -EAGAIN; 1048 goto out; 1049 } 1050 1051 /* Ignore NET_XMIT_CN as packet might have been sent */ 1052 if (err == NET_XMIT_DROP) { 1053 /* SKB completed but not sent */ 1054 err = -EBUSY; 1055 xs->skb = NULL; 1056 goto out; 1057 } 1058 1059 sent_frame = true; 1060 xs->skb = NULL; 1061 } 1062 1063 if (xskq_has_descs(xs->tx)) { 1064 if (xs->skb) 1065 xsk_drop_skb(xs->skb); 1066 xskq_cons_release(xs->tx); 1067 } 1068 1069 out: 1070 if (sent_frame) 1071 __xsk_tx_release(xs); 1072 1073 mutex_unlock(&xs->mutex); 1074 return err; 1075 } 1076 1077 static int xsk_generic_xmit(struct sock *sk) 1078 { 1079 int ret; 1080 1081 /* Drop the RCU lock since the SKB path might sleep. */ 1082 rcu_read_unlock(); 1083 ret = __xsk_generic_xmit(sk); 1084 /* Reaquire RCU lock before going into common code. */ 1085 rcu_read_lock(); 1086 1087 return ret; 1088 } 1089 1090 static bool xsk_no_wakeup(struct sock *sk) 1091 { 1092 #ifdef CONFIG_NET_RX_BUSY_POLL 1093 /* Prefer busy-polling, skip the wakeup. */ 1094 return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && 1095 napi_id_valid(READ_ONCE(sk->sk_napi_id)); 1096 #else 1097 return false; 1098 #endif 1099 } 1100 1101 static int xsk_check_common(struct xdp_sock *xs) 1102 { 1103 if (unlikely(!xsk_is_bound(xs))) 1104 return -ENXIO; 1105 if (unlikely(!(xs->dev->flags & IFF_UP))) 1106 return -ENETDOWN; 1107 1108 return 0; 1109 } 1110 1111 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1112 { 1113 bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 1114 struct sock *sk = sock->sk; 1115 struct xdp_sock *xs = xdp_sk(sk); 1116 struct xsk_buff_pool *pool; 1117 int err; 1118 1119 err = xsk_check_common(xs); 1120 if (err) 1121 return err; 1122 if (unlikely(need_wait)) 1123 return -EOPNOTSUPP; 1124 if (unlikely(!xs->tx)) 1125 return -ENOBUFS; 1126 1127 if (sk_can_busy_loop(sk)) 1128 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1129 1130 if (xs->zc && xsk_no_wakeup(sk)) 1131 return 0; 1132 1133 pool = xs->pool; 1134 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { 1135 if (xs->zc) 1136 return xsk_wakeup(xs, XDP_WAKEUP_TX); 1137 return xsk_generic_xmit(sk); 1138 } 1139 return 0; 1140 } 1141 1142 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1143 { 1144 int ret; 1145 1146 rcu_read_lock(); 1147 ret = __xsk_sendmsg(sock, m, total_len); 1148 rcu_read_unlock(); 1149 1150 return ret; 1151 } 1152 1153 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1154 { 1155 bool need_wait = !(flags & MSG_DONTWAIT); 1156 struct sock *sk = sock->sk; 1157 struct xdp_sock *xs = xdp_sk(sk); 1158 int err; 1159 1160 err = xsk_check_common(xs); 1161 if (err) 1162 return err; 1163 if (unlikely(!xs->rx)) 1164 return -ENOBUFS; 1165 if (unlikely(need_wait)) 1166 return -EOPNOTSUPP; 1167 1168 if (sk_can_busy_loop(sk)) 1169 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1170 1171 if (xsk_no_wakeup(sk)) 1172 return 0; 1173 1174 if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) 1175 return xsk_wakeup(xs, XDP_WAKEUP_RX); 1176 return 0; 1177 } 1178 1179 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1180 { 1181 int ret; 1182 1183 rcu_read_lock(); 1184 ret = __xsk_recvmsg(sock, m, len, flags); 1185 rcu_read_unlock(); 1186 1187 return ret; 1188 } 1189 1190 static __poll_t xsk_poll(struct file *file, struct socket *sock, 1191 struct poll_table_struct *wait) 1192 { 1193 __poll_t mask = 0; 1194 struct sock *sk = sock->sk; 1195 struct xdp_sock *xs = xdp_sk(sk); 1196 struct xsk_buff_pool *pool; 1197 1198 sock_poll_wait(file, sock, wait); 1199 1200 rcu_read_lock(); 1201 if (xsk_check_common(xs)) 1202 goto out; 1203 1204 pool = xs->pool; 1205 1206 if (pool->cached_need_wakeup) { 1207 if (xs->zc) 1208 xsk_wakeup(xs, pool->cached_need_wakeup); 1209 else if (xs->tx) 1210 /* Poll needs to drive Tx also in copy mode */ 1211 xsk_generic_xmit(sk); 1212 } 1213 1214 if (xs->rx && !xskq_prod_is_empty(xs->rx)) 1215 mask |= EPOLLIN | EPOLLRDNORM; 1216 if (xs->tx && xsk_tx_writeable(xs)) 1217 mask |= EPOLLOUT | EPOLLWRNORM; 1218 out: 1219 rcu_read_unlock(); 1220 return mask; 1221 } 1222 1223 static int xsk_init_queue(u32 entries, struct xsk_queue **queue, 1224 bool umem_queue) 1225 { 1226 struct xsk_queue *q; 1227 1228 if (entries == 0 || *queue || !is_power_of_2(entries)) 1229 return -EINVAL; 1230 1231 q = xskq_create(entries, umem_queue); 1232 if (!q) 1233 return -ENOMEM; 1234 1235 /* Make sure queue is ready before it can be seen by others */ 1236 smp_wmb(); 1237 WRITE_ONCE(*queue, q); 1238 return 0; 1239 } 1240 1241 static void xsk_unbind_dev(struct xdp_sock *xs) 1242 { 1243 struct net_device *dev = xs->dev; 1244 1245 if (xs->state != XSK_BOUND) 1246 return; 1247 WRITE_ONCE(xs->state, XSK_UNBOUND); 1248 1249 /* Wait for driver to stop using the xdp socket. */ 1250 xp_del_xsk(xs->pool, xs); 1251 synchronize_net(); 1252 dev_put(dev); 1253 } 1254 1255 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, 1256 struct xdp_sock __rcu ***map_entry) 1257 { 1258 struct xsk_map *map = NULL; 1259 struct xsk_map_node *node; 1260 1261 *map_entry = NULL; 1262 1263 spin_lock_bh(&xs->map_list_lock); 1264 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 1265 node); 1266 if (node) { 1267 bpf_map_inc(&node->map->map); 1268 map = node->map; 1269 *map_entry = node->map_entry; 1270 } 1271 spin_unlock_bh(&xs->map_list_lock); 1272 return map; 1273 } 1274 1275 static void xsk_delete_from_maps(struct xdp_sock *xs) 1276 { 1277 /* This function removes the current XDP socket from all the 1278 * maps it resides in. We need to take extra care here, due to 1279 * the two locks involved. Each map has a lock synchronizing 1280 * updates to the entries, and each socket has a lock that 1281 * synchronizes access to the list of maps (map_list). For 1282 * deadlock avoidance the locks need to be taken in the order 1283 * "map lock"->"socket map list lock". We start off by 1284 * accessing the socket map list, and take a reference to the 1285 * map to guarantee existence between the 1286 * xsk_get_map_list_entry() and xsk_map_try_sock_delete() 1287 * calls. Then we ask the map to remove the socket, which 1288 * tries to remove the socket from the map. Note that there 1289 * might be updates to the map between 1290 * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 1291 */ 1292 struct xdp_sock __rcu **map_entry = NULL; 1293 struct xsk_map *map; 1294 1295 while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 1296 xsk_map_try_sock_delete(map, xs, map_entry); 1297 bpf_map_put(&map->map); 1298 } 1299 } 1300 1301 static int xsk_release(struct socket *sock) 1302 { 1303 struct sock *sk = sock->sk; 1304 struct xdp_sock *xs = xdp_sk(sk); 1305 struct net *net; 1306 1307 if (!sk) 1308 return 0; 1309 1310 net = sock_net(sk); 1311 1312 if (xs->skb) 1313 xsk_drop_skb(xs->skb); 1314 1315 mutex_lock(&net->xdp.lock); 1316 sk_del_node_init_rcu(sk); 1317 mutex_unlock(&net->xdp.lock); 1318 1319 sock_prot_inuse_add(net, sk->sk_prot, -1); 1320 1321 xsk_delete_from_maps(xs); 1322 mutex_lock(&xs->mutex); 1323 xsk_unbind_dev(xs); 1324 mutex_unlock(&xs->mutex); 1325 1326 xskq_destroy(xs->rx); 1327 xskq_destroy(xs->tx); 1328 xskq_destroy(xs->fq_tmp); 1329 xskq_destroy(xs->cq_tmp); 1330 1331 sock_orphan(sk); 1332 sock->sk = NULL; 1333 1334 sock_put(sk); 1335 1336 return 0; 1337 } 1338 1339 static struct socket *xsk_lookup_xsk_from_fd(int fd) 1340 { 1341 struct socket *sock; 1342 int err; 1343 1344 sock = sockfd_lookup(fd, &err); 1345 if (!sock) 1346 return ERR_PTR(-ENOTSOCK); 1347 1348 if (sock->sk->sk_family != PF_XDP) { 1349 sockfd_put(sock); 1350 return ERR_PTR(-ENOPROTOOPT); 1351 } 1352 1353 return sock; 1354 } 1355 1356 static bool xsk_validate_queues(struct xdp_sock *xs) 1357 { 1358 return xs->fq_tmp && xs->cq_tmp; 1359 } 1360 1361 static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) 1362 { 1363 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 1364 struct sock *sk = sock->sk; 1365 struct xdp_sock *xs = xdp_sk(sk); 1366 struct net_device *dev; 1367 int bound_dev_if; 1368 u32 flags, qid; 1369 int err = 0; 1370 1371 if (addr_len < sizeof(struct sockaddr_xdp)) 1372 return -EINVAL; 1373 if (sxdp->sxdp_family != AF_XDP) 1374 return -EINVAL; 1375 1376 flags = sxdp->sxdp_flags; 1377 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | 1378 XDP_USE_NEED_WAKEUP | XDP_USE_SG)) 1379 return -EINVAL; 1380 1381 bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 1382 if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) 1383 return -EINVAL; 1384 1385 rtnl_lock(); 1386 mutex_lock(&xs->mutex); 1387 if (xs->state != XSK_READY) { 1388 err = -EBUSY; 1389 goto out_release; 1390 } 1391 1392 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 1393 if (!dev) { 1394 err = -ENODEV; 1395 goto out_release; 1396 } 1397 1398 netdev_lock_ops(dev); 1399 1400 if (!xs->rx && !xs->tx) { 1401 err = -EINVAL; 1402 goto out_unlock; 1403 } 1404 1405 qid = sxdp->sxdp_queue_id; 1406 1407 if (flags & XDP_SHARED_UMEM) { 1408 struct xdp_sock *umem_xs; 1409 struct socket *sock; 1410 1411 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || 1412 (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { 1413 /* Cannot specify flags for shared sockets. */ 1414 err = -EINVAL; 1415 goto out_unlock; 1416 } 1417 1418 if (xs->umem) { 1419 /* We have already our own. */ 1420 err = -EINVAL; 1421 goto out_unlock; 1422 } 1423 1424 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); 1425 if (IS_ERR(sock)) { 1426 err = PTR_ERR(sock); 1427 goto out_unlock; 1428 } 1429 1430 umem_xs = xdp_sk(sock->sk); 1431 if (!xsk_is_bound(umem_xs)) { 1432 err = -EBADF; 1433 sockfd_put(sock); 1434 goto out_unlock; 1435 } 1436 1437 if (umem_xs->queue_id != qid || umem_xs->dev != dev) { 1438 /* One fill and completion ring required for each queue id. */ 1439 if (!xsk_validate_queues(xs)) { 1440 err = -EINVAL; 1441 sockfd_put(sock); 1442 goto out_unlock; 1443 } 1444 1445 /* Share the umem with another socket on another qid 1446 * and/or device. 1447 */ 1448 xs->pool = xp_create_and_assign_umem(xs, 1449 umem_xs->umem); 1450 if (!xs->pool) { 1451 err = -ENOMEM; 1452 sockfd_put(sock); 1453 goto out_unlock; 1454 } 1455 1456 err = xp_assign_dev_shared(xs->pool, umem_xs, dev, 1457 qid); 1458 if (err) { 1459 xp_destroy(xs->pool); 1460 xs->pool = NULL; 1461 sockfd_put(sock); 1462 goto out_unlock; 1463 } 1464 } else { 1465 /* Share the buffer pool with the other socket. */ 1466 if (xs->fq_tmp || xs->cq_tmp) { 1467 /* Do not allow setting your own fq or cq. */ 1468 err = -EINVAL; 1469 sockfd_put(sock); 1470 goto out_unlock; 1471 } 1472 1473 xp_get_pool(umem_xs->pool); 1474 xs->pool = umem_xs->pool; 1475 1476 /* If underlying shared umem was created without Tx 1477 * ring, allocate Tx descs array that Tx batching API 1478 * utilizes 1479 */ 1480 if (xs->tx && !xs->pool->tx_descs) { 1481 err = xp_alloc_tx_descs(xs->pool, xs); 1482 if (err) { 1483 xp_put_pool(xs->pool); 1484 xs->pool = NULL; 1485 sockfd_put(sock); 1486 goto out_unlock; 1487 } 1488 } 1489 } 1490 1491 xdp_get_umem(umem_xs->umem); 1492 WRITE_ONCE(xs->umem, umem_xs->umem); 1493 sockfd_put(sock); 1494 } else if (!xs->umem || !xsk_validate_queues(xs)) { 1495 err = -EINVAL; 1496 goto out_unlock; 1497 } else { 1498 /* This xsk has its own umem. */ 1499 xs->pool = xp_create_and_assign_umem(xs, xs->umem); 1500 if (!xs->pool) { 1501 err = -ENOMEM; 1502 goto out_unlock; 1503 } 1504 1505 err = xp_assign_dev(xs->pool, dev, qid, flags); 1506 if (err) { 1507 xp_destroy(xs->pool); 1508 xs->pool = NULL; 1509 goto out_unlock; 1510 } 1511 } 1512 1513 /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ 1514 xs->fq_tmp = NULL; 1515 xs->cq_tmp = NULL; 1516 1517 xs->dev = dev; 1518 xs->zc = xs->umem->zc; 1519 xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); 1520 xs->queue_id = qid; 1521 xp_add_xsk(xs->pool, xs); 1522 1523 if (qid < dev->real_num_rx_queues) { 1524 struct netdev_rx_queue *rxq; 1525 1526 rxq = __netif_get_rx_queue(dev, qid); 1527 if (rxq->napi) 1528 __sk_mark_napi_id_once(sk, rxq->napi->napi_id); 1529 } 1530 1531 out_unlock: 1532 if (err) { 1533 dev_put(dev); 1534 } else { 1535 /* Matches smp_rmb() in bind() for shared umem 1536 * sockets, and xsk_is_bound(). 1537 */ 1538 smp_wmb(); 1539 WRITE_ONCE(xs->state, XSK_BOUND); 1540 } 1541 netdev_unlock_ops(dev); 1542 out_release: 1543 mutex_unlock(&xs->mutex); 1544 rtnl_unlock(); 1545 return err; 1546 } 1547 1548 struct xdp_umem_reg_v1 { 1549 __u64 addr; /* Start of packet data area */ 1550 __u64 len; /* Length of packet data area */ 1551 __u32 chunk_size; 1552 __u32 headroom; 1553 }; 1554 1555 static int xsk_setsockopt(struct socket *sock, int level, int optname, 1556 sockptr_t optval, unsigned int optlen) 1557 { 1558 struct sock *sk = sock->sk; 1559 struct xdp_sock *xs = xdp_sk(sk); 1560 int err; 1561 1562 if (level != SOL_XDP) 1563 return -ENOPROTOOPT; 1564 1565 switch (optname) { 1566 case XDP_RX_RING: 1567 case XDP_TX_RING: 1568 { 1569 struct xsk_queue **q; 1570 int entries; 1571 1572 if (optlen < sizeof(entries)) 1573 return -EINVAL; 1574 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1575 return -EFAULT; 1576 1577 mutex_lock(&xs->mutex); 1578 if (xs->state != XSK_READY) { 1579 mutex_unlock(&xs->mutex); 1580 return -EBUSY; 1581 } 1582 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; 1583 err = xsk_init_queue(entries, q, false); 1584 if (!err && optname == XDP_TX_RING) 1585 /* Tx needs to be explicitly woken up the first time */ 1586 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 1587 mutex_unlock(&xs->mutex); 1588 return err; 1589 } 1590 case XDP_UMEM_REG: 1591 { 1592 size_t mr_size = sizeof(struct xdp_umem_reg); 1593 struct xdp_umem_reg mr = {}; 1594 struct xdp_umem *umem; 1595 1596 if (optlen < sizeof(struct xdp_umem_reg_v1)) 1597 return -EINVAL; 1598 else if (optlen < sizeof(mr)) 1599 mr_size = sizeof(struct xdp_umem_reg_v1); 1600 1601 BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); 1602 1603 /* Make sure the last field of the struct doesn't have 1604 * uninitialized padding. All padding has to be explicit 1605 * and has to be set to zero by the userspace to make 1606 * struct xdp_umem_reg extensible in the future. 1607 */ 1608 BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + 1609 sizeof_field(struct xdp_umem_reg, tx_metadata_len) != 1610 sizeof(struct xdp_umem_reg)); 1611 1612 if (copy_from_sockptr(&mr, optval, mr_size)) 1613 return -EFAULT; 1614 1615 mutex_lock(&xs->mutex); 1616 if (xs->state != XSK_READY || xs->umem) { 1617 mutex_unlock(&xs->mutex); 1618 return -EBUSY; 1619 } 1620 1621 umem = xdp_umem_create(&mr); 1622 if (IS_ERR(umem)) { 1623 mutex_unlock(&xs->mutex); 1624 return PTR_ERR(umem); 1625 } 1626 1627 /* Make sure umem is ready before it can be seen by others */ 1628 smp_wmb(); 1629 WRITE_ONCE(xs->umem, umem); 1630 mutex_unlock(&xs->mutex); 1631 return 0; 1632 } 1633 case XDP_UMEM_FILL_RING: 1634 case XDP_UMEM_COMPLETION_RING: 1635 { 1636 struct xsk_queue **q; 1637 int entries; 1638 1639 if (optlen < sizeof(entries)) 1640 return -EINVAL; 1641 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1642 return -EFAULT; 1643 1644 mutex_lock(&xs->mutex); 1645 if (xs->state != XSK_READY) { 1646 mutex_unlock(&xs->mutex); 1647 return -EBUSY; 1648 } 1649 1650 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : 1651 &xs->cq_tmp; 1652 err = xsk_init_queue(entries, q, true); 1653 mutex_unlock(&xs->mutex); 1654 return err; 1655 } 1656 case XDP_MAX_TX_SKB_BUDGET: 1657 { 1658 unsigned int budget; 1659 1660 if (optlen != sizeof(budget)) 1661 return -EINVAL; 1662 if (copy_from_sockptr(&budget, optval, sizeof(budget))) 1663 return -EFAULT; 1664 if (!xs->tx || 1665 budget < TX_BATCH_SIZE || budget > xs->tx->nentries) 1666 return -EACCES; 1667 1668 WRITE_ONCE(xs->max_tx_budget, budget); 1669 return 0; 1670 } 1671 default: 1672 break; 1673 } 1674 1675 return -ENOPROTOOPT; 1676 } 1677 1678 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) 1679 { 1680 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 1681 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 1682 ring->desc = offsetof(struct xdp_rxtx_ring, desc); 1683 } 1684 1685 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) 1686 { 1687 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); 1688 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 1689 ring->desc = offsetof(struct xdp_umem_ring, desc); 1690 } 1691 1692 struct xdp_statistics_v1 { 1693 __u64 rx_dropped; 1694 __u64 rx_invalid_descs; 1695 __u64 tx_invalid_descs; 1696 }; 1697 1698 static int xsk_getsockopt(struct socket *sock, int level, int optname, 1699 char __user *optval, int __user *optlen) 1700 { 1701 struct sock *sk = sock->sk; 1702 struct xdp_sock *xs = xdp_sk(sk); 1703 int len; 1704 1705 if (level != SOL_XDP) 1706 return -ENOPROTOOPT; 1707 1708 if (get_user(len, optlen)) 1709 return -EFAULT; 1710 if (len < 0) 1711 return -EINVAL; 1712 1713 switch (optname) { 1714 case XDP_STATISTICS: 1715 { 1716 struct xdp_statistics stats = {}; 1717 bool extra_stats = true; 1718 size_t stats_size; 1719 1720 if (len < sizeof(struct xdp_statistics_v1)) { 1721 return -EINVAL; 1722 } else if (len < sizeof(stats)) { 1723 extra_stats = false; 1724 stats_size = sizeof(struct xdp_statistics_v1); 1725 } else { 1726 stats_size = sizeof(stats); 1727 } 1728 1729 mutex_lock(&xs->mutex); 1730 stats.rx_dropped = xs->rx_dropped; 1731 if (extra_stats) { 1732 stats.rx_ring_full = xs->rx_queue_full; 1733 stats.rx_fill_ring_empty_descs = 1734 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; 1735 stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); 1736 } else { 1737 stats.rx_dropped += xs->rx_queue_full; 1738 } 1739 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); 1740 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); 1741 mutex_unlock(&xs->mutex); 1742 1743 if (copy_to_user(optval, &stats, stats_size)) 1744 return -EFAULT; 1745 if (put_user(stats_size, optlen)) 1746 return -EFAULT; 1747 1748 return 0; 1749 } 1750 case XDP_MMAP_OFFSETS: 1751 { 1752 struct xdp_mmap_offsets off; 1753 struct xdp_mmap_offsets_v1 off_v1; 1754 bool flags_supported = true; 1755 void *to_copy; 1756 1757 if (len < sizeof(off_v1)) 1758 return -EINVAL; 1759 else if (len < sizeof(off)) 1760 flags_supported = false; 1761 1762 if (flags_supported) { 1763 /* xdp_ring_offset is identical to xdp_ring_offset_v1 1764 * except for the flags field added to the end. 1765 */ 1766 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1767 &off.rx); 1768 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1769 &off.tx); 1770 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1771 &off.fr); 1772 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1773 &off.cr); 1774 off.rx.flags = offsetof(struct xdp_rxtx_ring, 1775 ptrs.flags); 1776 off.tx.flags = offsetof(struct xdp_rxtx_ring, 1777 ptrs.flags); 1778 off.fr.flags = offsetof(struct xdp_umem_ring, 1779 ptrs.flags); 1780 off.cr.flags = offsetof(struct xdp_umem_ring, 1781 ptrs.flags); 1782 1783 len = sizeof(off); 1784 to_copy = &off; 1785 } else { 1786 xsk_enter_rxtx_offsets(&off_v1.rx); 1787 xsk_enter_rxtx_offsets(&off_v1.tx); 1788 xsk_enter_umem_offsets(&off_v1.fr); 1789 xsk_enter_umem_offsets(&off_v1.cr); 1790 1791 len = sizeof(off_v1); 1792 to_copy = &off_v1; 1793 } 1794 1795 if (copy_to_user(optval, to_copy, len)) 1796 return -EFAULT; 1797 if (put_user(len, optlen)) 1798 return -EFAULT; 1799 1800 return 0; 1801 } 1802 case XDP_OPTIONS: 1803 { 1804 struct xdp_options opts = {}; 1805 1806 if (len < sizeof(opts)) 1807 return -EINVAL; 1808 1809 mutex_lock(&xs->mutex); 1810 if (xs->zc) 1811 opts.flags |= XDP_OPTIONS_ZEROCOPY; 1812 mutex_unlock(&xs->mutex); 1813 1814 len = sizeof(opts); 1815 if (copy_to_user(optval, &opts, len)) 1816 return -EFAULT; 1817 if (put_user(len, optlen)) 1818 return -EFAULT; 1819 1820 return 0; 1821 } 1822 default: 1823 break; 1824 } 1825 1826 return -EOPNOTSUPP; 1827 } 1828 1829 static int xsk_mmap(struct file *file, struct socket *sock, 1830 struct vm_area_struct *vma) 1831 { 1832 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 1833 unsigned long size = vma->vm_end - vma->vm_start; 1834 struct xdp_sock *xs = xdp_sk(sock->sk); 1835 int state = READ_ONCE(xs->state); 1836 struct xsk_queue *q = NULL; 1837 1838 if (state != XSK_READY && state != XSK_BOUND) 1839 return -EBUSY; 1840 1841 if (offset == XDP_PGOFF_RX_RING) { 1842 q = READ_ONCE(xs->rx); 1843 } else if (offset == XDP_PGOFF_TX_RING) { 1844 q = READ_ONCE(xs->tx); 1845 } else { 1846 /* Matches the smp_wmb() in XDP_UMEM_REG */ 1847 smp_rmb(); 1848 if (offset == XDP_UMEM_PGOFF_FILL_RING) 1849 q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : 1850 READ_ONCE(xs->pool->fq); 1851 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 1852 q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : 1853 READ_ONCE(xs->pool->cq); 1854 } 1855 1856 if (!q) 1857 return -EINVAL; 1858 1859 /* Matches the smp_wmb() in xsk_init_queue */ 1860 smp_rmb(); 1861 if (size > q->ring_vmalloc_size) 1862 return -EINVAL; 1863 1864 return remap_vmalloc_range(vma, q->ring, 0); 1865 } 1866 1867 static int xsk_notifier(struct notifier_block *this, 1868 unsigned long msg, void *ptr) 1869 { 1870 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1871 struct net *net = dev_net(dev); 1872 struct sock *sk; 1873 1874 switch (msg) { 1875 case NETDEV_UNREGISTER: 1876 mutex_lock(&net->xdp.lock); 1877 sk_for_each(sk, &net->xdp.list) { 1878 struct xdp_sock *xs = xdp_sk(sk); 1879 1880 mutex_lock(&xs->mutex); 1881 if (xs->dev == dev) { 1882 sk->sk_err = ENETDOWN; 1883 if (!sock_flag(sk, SOCK_DEAD)) 1884 sk_error_report(sk); 1885 1886 xsk_unbind_dev(xs); 1887 1888 /* Clear device references. */ 1889 xp_clear_dev(xs->pool); 1890 } 1891 mutex_unlock(&xs->mutex); 1892 } 1893 mutex_unlock(&net->xdp.lock); 1894 break; 1895 } 1896 return NOTIFY_DONE; 1897 } 1898 1899 static struct proto xsk_proto = { 1900 .name = "XDP", 1901 .owner = THIS_MODULE, 1902 .obj_size = sizeof(struct xdp_sock), 1903 }; 1904 1905 static const struct proto_ops xsk_proto_ops = { 1906 .family = PF_XDP, 1907 .owner = THIS_MODULE, 1908 .release = xsk_release, 1909 .bind = xsk_bind, 1910 .connect = sock_no_connect, 1911 .socketpair = sock_no_socketpair, 1912 .accept = sock_no_accept, 1913 .getname = sock_no_getname, 1914 .poll = xsk_poll, 1915 .ioctl = sock_no_ioctl, 1916 .listen = sock_no_listen, 1917 .shutdown = sock_no_shutdown, 1918 .setsockopt = xsk_setsockopt, 1919 .getsockopt = xsk_getsockopt, 1920 .sendmsg = xsk_sendmsg, 1921 .recvmsg = xsk_recvmsg, 1922 .mmap = xsk_mmap, 1923 }; 1924 1925 static void xsk_destruct(struct sock *sk) 1926 { 1927 struct xdp_sock *xs = xdp_sk(sk); 1928 1929 if (!sock_flag(sk, SOCK_DEAD)) 1930 return; 1931 1932 if (!xp_put_pool(xs->pool)) 1933 xdp_put_umem(xs->umem, !xs->pool); 1934 } 1935 1936 static int xsk_create(struct net *net, struct socket *sock, int protocol, 1937 int kern) 1938 { 1939 struct xdp_sock *xs; 1940 struct sock *sk; 1941 1942 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 1943 return -EPERM; 1944 if (sock->type != SOCK_RAW) 1945 return -ESOCKTNOSUPPORT; 1946 1947 if (protocol) 1948 return -EPROTONOSUPPORT; 1949 1950 sock->state = SS_UNCONNECTED; 1951 1952 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); 1953 if (!sk) 1954 return -ENOBUFS; 1955 1956 sock->ops = &xsk_proto_ops; 1957 1958 sock_init_data(sock, sk); 1959 1960 sk->sk_family = PF_XDP; 1961 1962 sk->sk_destruct = xsk_destruct; 1963 1964 sock_set_flag(sk, SOCK_RCU_FREE); 1965 1966 xs = xdp_sk(sk); 1967 xs->state = XSK_READY; 1968 xs->max_tx_budget = TX_BATCH_SIZE; 1969 mutex_init(&xs->mutex); 1970 1971 INIT_LIST_HEAD(&xs->map_list); 1972 spin_lock_init(&xs->map_list_lock); 1973 1974 mutex_lock(&net->xdp.lock); 1975 sk_add_node_rcu(sk, &net->xdp.list); 1976 mutex_unlock(&net->xdp.lock); 1977 1978 sock_prot_inuse_add(net, &xsk_proto, 1); 1979 1980 return 0; 1981 } 1982 1983 static const struct net_proto_family xsk_family_ops = { 1984 .family = PF_XDP, 1985 .create = xsk_create, 1986 .owner = THIS_MODULE, 1987 }; 1988 1989 static struct notifier_block xsk_netdev_notifier = { 1990 .notifier_call = xsk_notifier, 1991 }; 1992 1993 static int __net_init xsk_net_init(struct net *net) 1994 { 1995 mutex_init(&net->xdp.lock); 1996 INIT_HLIST_HEAD(&net->xdp.list); 1997 return 0; 1998 } 1999 2000 static void __net_exit xsk_net_exit(struct net *net) 2001 { 2002 WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); 2003 } 2004 2005 static struct pernet_operations xsk_net_ops = { 2006 .init = xsk_net_init, 2007 .exit = xsk_net_exit, 2008 }; 2009 2010 static int __init xsk_init(void) 2011 { 2012 int err; 2013 2014 err = proto_register(&xsk_proto, 0 /* no slab */); 2015 if (err) 2016 goto out; 2017 2018 err = sock_register(&xsk_family_ops); 2019 if (err) 2020 goto out_proto; 2021 2022 err = register_pernet_subsys(&xsk_net_ops); 2023 if (err) 2024 goto out_sk; 2025 2026 err = register_netdevice_notifier(&xsk_netdev_notifier); 2027 if (err) 2028 goto out_pernet; 2029 2030 xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", 2031 sizeof(struct xsk_addrs), 2032 0, SLAB_HWCACHE_ALIGN, NULL); 2033 if (!xsk_tx_generic_cache) { 2034 err = -ENOMEM; 2035 goto out_unreg_notif; 2036 } 2037 2038 return 0; 2039 2040 out_unreg_notif: 2041 unregister_netdevice_notifier(&xsk_netdev_notifier); 2042 out_pernet: 2043 unregister_pernet_subsys(&xsk_net_ops); 2044 out_sk: 2045 sock_unregister(PF_XDP); 2046 out_proto: 2047 proto_unregister(&xsk_proto); 2048 out: 2049 return err; 2050 } 2051 2052 fs_initcall(xsk_init); 2053