1 // SPDX-License-Identifier: GPL-2.0 2 /* XDP sockets 3 * 4 * AF_XDP sockets allows a channel between XDP programs and userspace 5 * applications. 6 * Copyright(c) 2018 Intel Corporation. 7 * 8 * Author(s): Björn Töpel <bjorn.topel@intel.com> 9 * Magnus Karlsson <magnus.karlsson@intel.com> 10 */ 11 12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ 13 14 #include <linux/if_xdp.h> 15 #include <linux/init.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/sched/task.h> 19 #include <linux/socket.h> 20 #include <linux/file.h> 21 #include <linux/uaccess.h> 22 #include <linux/net.h> 23 #include <linux/netdevice.h> 24 #include <linux/rculist.h> 25 #include <linux/vmalloc.h> 26 27 #include <net/netdev_queues.h> 28 #include <net/xdp_sock_drv.h> 29 #include <net/busy_poll.h> 30 #include <net/netdev_lock.h> 31 #include <net/netdev_rx_queue.h> 32 #include <net/xdp.h> 33 34 #include "../core/dev.h" 35 36 #include "xsk_queue.h" 37 #include "xdp_umem.h" 38 #include "xsk.h" 39 40 #define TX_BATCH_SIZE 32 41 #define MAX_PER_SOCKET_BUDGET 32 42 43 struct xsk_addrs { 44 u32 num_descs; 45 u64 addrs[MAX_SKB_FRAGS + 1]; 46 }; 47 48 static struct kmem_cache *xsk_tx_generic_cache; 49 50 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) 51 { 52 if (pool->cached_need_wakeup & XDP_WAKEUP_RX) 53 return; 54 55 pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; 56 pool->cached_need_wakeup |= XDP_WAKEUP_RX; 57 } 58 EXPORT_SYMBOL(xsk_set_rx_need_wakeup); 59 60 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) 61 { 62 struct xdp_sock *xs; 63 64 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) 65 return; 66 67 rcu_read_lock(); 68 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 69 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 70 } 71 rcu_read_unlock(); 72 73 pool->cached_need_wakeup |= XDP_WAKEUP_TX; 74 } 75 EXPORT_SYMBOL(xsk_set_tx_need_wakeup); 76 77 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) 78 { 79 if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) 80 return; 81 82 pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; 83 pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; 84 } 85 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); 86 87 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) 88 { 89 struct xdp_sock *xs; 90 91 if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) 92 return; 93 94 rcu_read_lock(); 95 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 96 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 97 } 98 rcu_read_unlock(); 99 100 pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; 101 } 102 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); 103 104 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) 105 { 106 return pool->uses_need_wakeup; 107 } 108 EXPORT_SYMBOL(xsk_uses_need_wakeup); 109 110 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 111 u16 queue_id) 112 { 113 if (queue_id < dev->real_num_rx_queues) 114 return dev->_rx[queue_id].pool; 115 if (queue_id < dev->real_num_tx_queues) 116 return dev->_tx[queue_id].pool; 117 118 return NULL; 119 } 120 EXPORT_SYMBOL(xsk_get_pool_from_qid); 121 122 static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 123 { 124 if (queue_id < dev->num_rx_queues) 125 dev->_rx[queue_id].pool = NULL; 126 if (queue_id < dev->num_tx_queues) 127 dev->_tx[queue_id].pool = NULL; 128 } 129 130 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 131 { 132 struct netdev_rx_queue *hw_rxq; 133 134 if (!netif_rxq_is_leased(dev, queue_id)) 135 return __xsk_clear_pool_at_qid(dev, queue_id); 136 WARN_ON_ONCE(!netif_is_queue_leasee(dev)); 137 138 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 139 140 netdev_lock(hw_rxq->dev); 141 queue_id = get_netdev_rx_queue_index(hw_rxq); 142 __xsk_clear_pool_at_qid(hw_rxq->dev, queue_id); 143 netdev_unlock(hw_rxq->dev); 144 } 145 146 static int __xsk_reg_pool_at_qid(struct net_device *dev, 147 struct xsk_buff_pool *pool, u16 queue_id) 148 { 149 if (xsk_get_pool_from_qid(dev, queue_id)) 150 return -EBUSY; 151 152 if (queue_id < dev->real_num_rx_queues) 153 dev->_rx[queue_id].pool = pool; 154 if (queue_id < dev->real_num_tx_queues) 155 dev->_tx[queue_id].pool = pool; 156 157 return 0; 158 } 159 160 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do 161 * not know if the device has more tx queues than rx, or the opposite. 162 * This might also change during run time. 163 */ 164 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 165 u16 queue_id) 166 { 167 struct netdev_rx_queue *hw_rxq; 168 int ret; 169 170 if (queue_id >= max(dev->real_num_rx_queues, 171 dev->real_num_tx_queues)) 172 return -EINVAL; 173 174 if (queue_id >= dev->real_num_rx_queues || 175 !netif_rxq_is_leased(dev, queue_id)) 176 return __xsk_reg_pool_at_qid(dev, pool, queue_id); 177 if (!netif_is_queue_leasee(dev)) 178 return -EBUSY; 179 180 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 181 182 netdev_lock(hw_rxq->dev); 183 queue_id = get_netdev_rx_queue_index(hw_rxq); 184 ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id); 185 netdev_unlock(hw_rxq->dev); 186 187 return ret; 188 } 189 190 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, 191 u32 flags) 192 { 193 u64 addr; 194 int err; 195 196 addr = xp_get_handle(xskb, xskb->pool); 197 err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); 198 if (err) { 199 xs->rx_queue_full++; 200 return err; 201 } 202 203 xp_release(xskb); 204 return 0; 205 } 206 207 static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, 208 u32 len, u32 flags) 209 { 210 u64 addr; 211 212 addr = xp_get_handle(xskb, xskb->pool); 213 __xskq_prod_reserve_desc(xs->rx, addr, len, flags); 214 215 xp_release(xskb); 216 } 217 218 static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 219 { 220 struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); 221 u32 frags = xdp_buff_has_frags(xdp); 222 struct xdp_buff_xsk *pos, *tmp; 223 struct list_head *xskb_list; 224 u32 contd = 0; 225 u32 num_desc; 226 int err; 227 228 if (likely(!frags)) { 229 err = __xsk_rcv_zc(xs, xskb, len, contd); 230 if (err) 231 goto err; 232 return 0; 233 } 234 235 contd = XDP_PKT_CONTD; 236 num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; 237 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 238 xs->rx_queue_full++; 239 err = -ENOBUFS; 240 goto err; 241 } 242 243 __xsk_rcv_zc_safe(xs, xskb, len, contd); 244 xskb_list = &xskb->pool->xskb_list; 245 list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { 246 if (list_is_singular(xskb_list)) 247 contd = 0; 248 len = pos->xdp.data_end - pos->xdp.data; 249 __xsk_rcv_zc_safe(xs, pos, len, contd); 250 list_del_init(&pos->list_node); 251 } 252 253 return 0; 254 err: 255 xsk_buff_free(xdp); 256 return err; 257 } 258 259 static void *xsk_copy_xdp_start(struct xdp_buff *from) 260 { 261 if (unlikely(xdp_data_meta_unsupported(from))) 262 return from->data; 263 else 264 return from->data_meta; 265 } 266 267 static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, 268 u32 *from_len, skb_frag_t **frag, u32 rem) 269 { 270 u32 copied = 0; 271 272 while (1) { 273 u32 copy_len = min_t(u32, *from_len, to_len); 274 275 memcpy(to, *from, copy_len); 276 copied += copy_len; 277 if (rem == copied) 278 return copied; 279 280 if (*from_len == copy_len) { 281 *from = skb_frag_address(*frag); 282 *from_len = skb_frag_size((*frag)++); 283 } else { 284 *from += copy_len; 285 *from_len -= copy_len; 286 } 287 if (to_len == copy_len) 288 return copied; 289 290 to_len -= copy_len; 291 to += copy_len; 292 } 293 } 294 295 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 296 { 297 u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); 298 void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; 299 u32 from_len, meta_len, rem, num_desc; 300 struct xdp_buff_xsk *xskb; 301 struct xdp_buff *xsk_xdp; 302 skb_frag_t *frag; 303 304 from_len = xdp->data_end - copy_from; 305 meta_len = xdp->data - copy_from; 306 rem = len + meta_len; 307 308 if (len <= frame_size && !xdp_buff_has_frags(xdp)) { 309 int err; 310 311 xsk_xdp = xsk_buff_alloc(xs->pool); 312 if (!xsk_xdp) { 313 xs->rx_dropped++; 314 return -ENOMEM; 315 } 316 memcpy(xsk_xdp->data - meta_len, copy_from, rem); 317 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 318 err = __xsk_rcv_zc(xs, xskb, len, 0); 319 if (err) { 320 xsk_buff_free(xsk_xdp); 321 return err; 322 } 323 324 return 0; 325 } 326 327 num_desc = (len - 1) / frame_size + 1; 328 329 if (!xsk_buff_can_alloc(xs->pool, num_desc)) { 330 xs->rx_dropped++; 331 return -ENOMEM; 332 } 333 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 334 xs->rx_queue_full++; 335 return -ENOBUFS; 336 } 337 338 if (xdp_buff_has_frags(xdp)) { 339 struct skb_shared_info *sinfo; 340 341 sinfo = xdp_get_shared_info_from_buff(xdp); 342 frag = &sinfo->frags[0]; 343 } 344 345 do { 346 u32 to_len = frame_size + meta_len; 347 u32 copied; 348 349 xsk_xdp = xsk_buff_alloc(xs->pool); 350 copy_to = xsk_xdp->data - meta_len; 351 352 copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); 353 rem -= copied; 354 355 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 356 __xsk_rcv_zc_safe(xs, xskb, copied - meta_len, 357 rem ? XDP_PKT_CONTD : 0); 358 meta_len = 0; 359 } while (rem); 360 361 return 0; 362 } 363 364 static bool xsk_tx_writeable(struct xdp_sock *xs) 365 { 366 if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) 367 return false; 368 369 return true; 370 } 371 372 static void __xsk_tx_release(struct xdp_sock *xs) 373 { 374 __xskq_cons_release(xs->tx); 375 if (xsk_tx_writeable(xs)) 376 xs->sk.sk_write_space(&xs->sk); 377 } 378 379 static bool xsk_is_bound(struct xdp_sock *xs) 380 { 381 if (READ_ONCE(xs->state) == XSK_BOUND) { 382 /* Matches smp_wmb() in bind(). */ 383 smp_rmb(); 384 return true; 385 } 386 return false; 387 } 388 389 static bool xsk_dev_queue_valid(const struct xdp_sock *xs, 390 const struct xdp_rxq_info *info) 391 { 392 struct net_device *dev = xs->dev; 393 u32 queue_index = xs->queue_id; 394 struct netdev_rx_queue *rxq; 395 396 if (info->dev == dev && 397 info->queue_index == queue_index) 398 return true; 399 400 if (queue_index < dev->real_num_rx_queues) { 401 rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); 402 if (!rxq) 403 return false; 404 405 dev = rxq->dev; 406 queue_index = get_netdev_rx_queue_index(rxq); 407 408 return info->dev == dev && 409 info->queue_index == queue_index; 410 } 411 return false; 412 } 413 414 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 415 { 416 if (!xsk_is_bound(xs)) 417 return -ENXIO; 418 if (!xsk_dev_queue_valid(xs, xdp->rxq)) 419 return -EINVAL; 420 421 if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { 422 xs->rx_dropped++; 423 return -ENOSPC; 424 } 425 426 return 0; 427 } 428 429 static void xsk_flush(struct xdp_sock *xs) 430 { 431 xskq_prod_submit(xs->rx); 432 __xskq_cons_release(xs->pool->fq); 433 sock_def_readable(&xs->sk); 434 } 435 436 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 437 { 438 u32 len = xdp_get_buff_len(xdp); 439 int err; 440 441 err = xsk_rcv_check(xs, xdp, len); 442 if (!err) { 443 spin_lock_bh(&xs->pool->rx_lock); 444 err = __xsk_rcv(xs, xdp, len); 445 xsk_flush(xs); 446 spin_unlock_bh(&xs->pool->rx_lock); 447 } 448 449 return err; 450 } 451 452 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 453 { 454 u32 len = xdp_get_buff_len(xdp); 455 int err; 456 457 err = xsk_rcv_check(xs, xdp, len); 458 if (err) 459 return err; 460 461 if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { 462 len = xdp->data_end - xdp->data; 463 return xsk_rcv_zc(xs, xdp, len); 464 } 465 466 err = __xsk_rcv(xs, xdp, len); 467 if (!err) 468 xdp_return_buff(xdp); 469 return err; 470 } 471 472 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) 473 { 474 int err; 475 476 err = xsk_rcv(xs, xdp); 477 if (err) 478 return err; 479 480 if (!xs->flush_node.prev) { 481 struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); 482 483 list_add(&xs->flush_node, flush_list); 484 } 485 486 return 0; 487 } 488 489 void __xsk_map_flush(struct list_head *flush_list) 490 { 491 struct xdp_sock *xs, *tmp; 492 493 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 494 xsk_flush(xs); 495 __list_del_clearprev(&xs->flush_node); 496 } 497 } 498 499 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) 500 { 501 xskq_prod_submit_n(pool->cq, nb_entries); 502 } 503 EXPORT_SYMBOL(xsk_tx_completed); 504 505 void xsk_tx_release(struct xsk_buff_pool *pool) 506 { 507 struct xdp_sock *xs; 508 509 rcu_read_lock(); 510 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 511 __xsk_tx_release(xs); 512 rcu_read_unlock(); 513 } 514 EXPORT_SYMBOL(xsk_tx_release); 515 516 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) 517 { 518 bool budget_exhausted = false; 519 struct xdp_sock *xs; 520 521 rcu_read_lock(); 522 again: 523 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 524 if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { 525 budget_exhausted = true; 526 continue; 527 } 528 529 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { 530 if (xskq_has_descs(xs->tx)) 531 xskq_cons_release(xs->tx); 532 continue; 533 } 534 535 xs->tx_budget_spent++; 536 537 /* This is the backpressure mechanism for the Tx path. 538 * Reserve space in the completion queue and only proceed 539 * if there is space in it. This avoids having to implement 540 * any buffering in the Tx path. 541 */ 542 if (xskq_prod_reserve_addr(pool->cq, desc->addr)) 543 goto out; 544 545 xskq_cons_release(xs->tx); 546 rcu_read_unlock(); 547 return true; 548 } 549 550 if (budget_exhausted) { 551 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 552 xs->tx_budget_spent = 0; 553 554 budget_exhausted = false; 555 goto again; 556 } 557 558 out: 559 rcu_read_unlock(); 560 return false; 561 } 562 EXPORT_SYMBOL(xsk_tx_peek_desc); 563 564 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) 565 { 566 struct xdp_desc *descs = pool->tx_descs; 567 u32 nb_pkts = 0; 568 569 while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) 570 nb_pkts++; 571 572 xsk_tx_release(pool); 573 return nb_pkts; 574 } 575 576 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) 577 { 578 struct xdp_sock *xs; 579 580 rcu_read_lock(); 581 if (!list_is_singular(&pool->xsk_tx_list)) { 582 /* Fallback to the non-batched version */ 583 rcu_read_unlock(); 584 return xsk_tx_peek_release_fallback(pool, nb_pkts); 585 } 586 587 xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); 588 if (!xs) { 589 nb_pkts = 0; 590 goto out; 591 } 592 593 nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); 594 595 /* This is the backpressure mechanism for the Tx path. Try to 596 * reserve space in the completion queue for all packets, but 597 * if there are fewer slots available, just process that many 598 * packets. This avoids having to implement any buffering in 599 * the Tx path. 600 */ 601 nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); 602 if (!nb_pkts) 603 goto out; 604 605 nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); 606 if (!nb_pkts) { 607 xs->tx->queue_empty_descs++; 608 goto out; 609 } 610 611 __xskq_cons_release(xs->tx); 612 xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); 613 xs->sk.sk_write_space(&xs->sk); 614 615 out: 616 rcu_read_unlock(); 617 return nb_pkts; 618 } 619 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); 620 621 static int xsk_wakeup(struct xdp_sock *xs, u8 flags) 622 { 623 struct net_device *dev = xs->dev; 624 625 return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); 626 } 627 628 static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) 629 { 630 int ret; 631 632 spin_lock(&pool->cq->cq_cached_prod_lock); 633 ret = xskq_prod_reserve(pool->cq); 634 spin_unlock(&pool->cq->cq_cached_prod_lock); 635 636 return ret; 637 } 638 639 static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) 640 { 641 return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; 642 } 643 644 static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) 645 { 646 return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); 647 } 648 649 static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) 650 { 651 struct xsk_addrs *xsk_addr; 652 653 xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); 654 if (unlikely(!xsk_addr)) 655 return NULL; 656 657 xsk_addr->addrs[0] = addr; 658 skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 659 return xsk_addr; 660 } 661 662 static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) 663 { 664 struct xsk_addrs *xsk_addr; 665 666 if (!xsk_skb_destructor_is_addr(skb)) 667 return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 668 669 xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); 670 if (likely(xsk_addr)) 671 xsk_addr->num_descs = 1; 672 return xsk_addr; 673 } 674 675 static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) 676 { 677 if (IS_ENABLED(CONFIG_64BIT)) { 678 skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); 679 return 0; 680 } 681 682 if (unlikely(!__xsk_addrs_alloc(skb, addr))) 683 return -ENOMEM; 684 return 0; 685 } 686 687 static void xsk_inc_num_desc(struct sk_buff *skb) 688 { 689 struct xsk_addrs *xsk_addr; 690 691 if (!xsk_skb_destructor_is_addr(skb)) { 692 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 693 xsk_addr->num_descs++; 694 } 695 } 696 697 static u32 xsk_get_num_desc(struct sk_buff *skb) 698 { 699 struct xsk_addrs *xsk_addr; 700 701 if (xsk_skb_destructor_is_addr(skb)) 702 return 1; 703 704 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 705 706 return xsk_addr->num_descs; 707 } 708 709 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, 710 struct sk_buff *skb) 711 { 712 u32 num_descs = xsk_get_num_desc(skb); 713 struct xsk_addrs *xsk_addr; 714 u32 descs_processed = 0; 715 unsigned long flags; 716 u32 idx, i; 717 718 spin_lock_irqsave(&pool->cq_prod_lock, flags); 719 idx = xskq_get_prod(pool->cq); 720 721 if (unlikely(!xsk_skb_destructor_is_addr(skb))) { 722 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 723 724 for (i = 0; i < num_descs; i++) { 725 xskq_prod_write_addr(pool->cq, idx + descs_processed, 726 xsk_addr->addrs[i]); 727 descs_processed++; 728 } 729 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 730 } else { 731 xskq_prod_write_addr(pool->cq, idx, 732 xsk_skb_destructor_get_addr(skb)); 733 descs_processed++; 734 } 735 xskq_prod_submit_n(pool->cq, descs_processed); 736 spin_unlock_irqrestore(&pool->cq_prod_lock, flags); 737 } 738 739 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) 740 { 741 spin_lock(&pool->cq->cq_cached_prod_lock); 742 xskq_prod_cancel_n(pool->cq, n); 743 spin_unlock(&pool->cq->cq_cached_prod_lock); 744 } 745 746 INDIRECT_CALLABLE_SCOPE 747 void xsk_destruct_skb(struct sk_buff *skb) 748 { 749 struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; 750 751 if (compl->tx_timestamp) { 752 /* sw completion timestamp, not a real one */ 753 *compl->tx_timestamp = ktime_get_tai_fast_ns(); 754 } 755 756 xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); 757 sock_wfree(skb); 758 } 759 760 static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, 761 u64 addr) 762 { 763 int err; 764 765 err = xsk_skb_destructor_set_addr(skb, addr); 766 if (unlikely(err)) 767 return err; 768 769 skb->dev = xs->dev; 770 skb->priority = READ_ONCE(xs->sk.sk_priority); 771 skb->mark = READ_ONCE(xs->sk.sk_mark); 772 skb->destructor = xsk_destruct_skb; 773 return 0; 774 } 775 776 static void xsk_consume_skb(struct sk_buff *skb) 777 { 778 struct xdp_sock *xs = xdp_sk(skb->sk); 779 u32 num_descs = xsk_get_num_desc(skb); 780 struct xsk_addrs *xsk_addr; 781 782 if (unlikely(!xsk_skb_destructor_is_addr(skb))) { 783 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 784 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 785 } 786 787 skb->destructor = sock_wfree; 788 xsk_cq_cancel_locked(xs->pool, num_descs); 789 /* Free skb without triggering the perf drop trace */ 790 consume_skb(skb); 791 xs->skb = NULL; 792 } 793 794 static void xsk_drop_skb(struct sk_buff *skb) 795 { 796 xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); 797 xsk_consume_skb(skb); 798 } 799 800 static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, 801 struct xdp_desc *desc, struct xsk_buff_pool *pool, 802 u32 hr) 803 { 804 struct xsk_tx_metadata *meta = NULL; 805 u16 csum_start, csum_offset; 806 807 if (unlikely(pool->tx_metadata_len == 0)) 808 return -EINVAL; 809 810 meta = buffer - pool->tx_metadata_len; 811 if (unlikely(!xsk_buff_valid_tx_metadata(meta))) 812 return -EINVAL; 813 814 if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { 815 csum_start = READ_ONCE(meta->request.csum_start); 816 csum_offset = READ_ONCE(meta->request.csum_offset); 817 818 if (unlikely(csum_start + csum_offset + 819 sizeof(__sum16) > desc->len)) 820 return -EINVAL; 821 822 skb->csum_start = hr + csum_start; 823 skb->csum_offset = csum_offset; 824 skb->ip_summed = CHECKSUM_PARTIAL; 825 826 if (unlikely(pool->tx_sw_csum)) { 827 int err; 828 829 err = skb_checksum_help(skb); 830 if (err) 831 return err; 832 } 833 } 834 835 if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) 836 skb->skb_mstamp_ns = meta->request.launch_time; 837 xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); 838 839 return 0; 840 } 841 842 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, 843 struct xdp_desc *desc) 844 { 845 struct xsk_buff_pool *pool = xs->pool; 846 u32 hr, len, ts, offset, copy, copied; 847 struct sk_buff *skb = xs->skb; 848 struct page *page; 849 void *buffer; 850 int err, i; 851 u64 addr; 852 853 addr = desc->addr; 854 buffer = xsk_buff_raw_get_data(pool, addr); 855 856 if (!skb) { 857 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); 858 859 skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); 860 if (unlikely(!skb)) 861 return ERR_PTR(err); 862 863 skb_reserve(skb, hr); 864 if (desc->options & XDP_TX_METADATA) { 865 err = xsk_skb_metadata(skb, buffer, desc, pool, hr); 866 if (unlikely(err)) { 867 kfree_skb(skb); 868 return ERR_PTR(err); 869 } 870 } 871 } else { 872 struct xsk_addrs *xsk_addr; 873 874 xsk_addr = xsk_addrs_alloc(skb); 875 if (!xsk_addr) 876 return ERR_PTR(-ENOMEM); 877 878 /* in case of -EOVERFLOW that could happen below, 879 * xsk_consume_skb() will release this node as whole skb 880 * would be dropped, which implies freeing all list elements 881 */ 882 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 883 } 884 885 len = desc->len; 886 ts = pool->unaligned ? len : pool->chunk_size; 887 888 offset = offset_in_page(buffer); 889 addr = buffer - pool->addrs; 890 891 for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { 892 if (unlikely(i >= MAX_SKB_FRAGS)) { 893 if (!xs->skb) 894 kfree_skb(skb); 895 return ERR_PTR(-EOVERFLOW); 896 } 897 898 page = pool->umem->pgs[addr >> PAGE_SHIFT]; 899 get_page(page); 900 901 copy = min_t(u32, PAGE_SIZE - offset, len - copied); 902 skb_fill_page_desc(skb, i, page, offset, copy); 903 904 copied += copy; 905 addr += copy; 906 offset = 0; 907 } 908 909 skb->len += len; 910 skb->data_len += len; 911 skb->truesize += ts; 912 913 refcount_add(ts, &xs->sk.sk_wmem_alloc); 914 915 return skb; 916 } 917 918 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, 919 struct xdp_desc *desc) 920 { 921 struct net_device *dev = xs->dev; 922 struct sk_buff *skb = xs->skb; 923 int err; 924 925 if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { 926 skb = xsk_build_skb_zerocopy(xs, desc); 927 if (IS_ERR(skb)) { 928 err = PTR_ERR(skb); 929 skb = NULL; 930 goto free_err; 931 } 932 } else { 933 u32 hr, tr, len; 934 void *buffer; 935 936 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); 937 len = desc->len; 938 939 if (!skb) { 940 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); 941 tr = dev->needed_tailroom; 942 skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); 943 if (unlikely(!skb)) 944 goto free_err; 945 946 skb_reserve(skb, hr); 947 skb_put(skb, len); 948 949 err = skb_store_bits(skb, 0, buffer, len); 950 if (unlikely(err)) 951 goto free_err; 952 953 if (desc->options & XDP_TX_METADATA) { 954 err = xsk_skb_metadata(skb, buffer, desc, 955 xs->pool, hr); 956 if (unlikely(err)) 957 goto free_err; 958 } 959 } else { 960 int nr_frags = skb_shinfo(skb)->nr_frags; 961 struct xsk_addrs *xsk_addr; 962 struct page *page; 963 u8 *vaddr; 964 965 xsk_addr = xsk_addrs_alloc(skb); 966 if (!xsk_addr) { 967 err = -ENOMEM; 968 goto free_err; 969 } 970 971 if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { 972 err = -EOVERFLOW; 973 goto free_err; 974 } 975 976 page = alloc_page(xs->sk.sk_allocation); 977 if (unlikely(!page)) { 978 err = -EAGAIN; 979 goto free_err; 980 } 981 982 vaddr = kmap_local_page(page); 983 memcpy(vaddr, buffer, len); 984 kunmap_local(vaddr); 985 986 skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); 987 refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); 988 989 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 990 } 991 } 992 993 if (!xs->skb) { 994 err = xsk_skb_init_misc(skb, xs, desc->addr); 995 if (unlikely(err)) 996 goto free_err; 997 } 998 xsk_inc_num_desc(skb); 999 1000 return skb; 1001 1002 free_err: 1003 if (skb && !xs->skb) 1004 kfree_skb(skb); 1005 1006 if (err == -EOVERFLOW) { 1007 if (xs->skb) { 1008 /* Drop the packet */ 1009 xsk_inc_num_desc(xs->skb); 1010 xsk_drop_skb(xs->skb); 1011 } else { 1012 xsk_cq_cancel_locked(xs->pool, 1); 1013 xs->tx->invalid_descs++; 1014 } 1015 xskq_cons_release(xs->tx); 1016 } else { 1017 /* Let application retry */ 1018 xsk_cq_cancel_locked(xs->pool, 1); 1019 } 1020 1021 return ERR_PTR(err); 1022 } 1023 1024 static int __xsk_generic_xmit(struct sock *sk) 1025 { 1026 struct xdp_sock *xs = xdp_sk(sk); 1027 bool sent_frame = false; 1028 struct xdp_desc desc; 1029 struct sk_buff *skb; 1030 u32 max_batch; 1031 int err = 0; 1032 1033 mutex_lock(&xs->mutex); 1034 1035 /* Since we dropped the RCU read lock, the socket state might have changed. */ 1036 if (unlikely(!xsk_is_bound(xs))) { 1037 err = -ENXIO; 1038 goto out; 1039 } 1040 1041 if (xs->queue_id >= xs->dev->real_num_tx_queues) 1042 goto out; 1043 1044 max_batch = READ_ONCE(xs->max_tx_budget); 1045 while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { 1046 if (max_batch-- == 0) { 1047 err = -EAGAIN; 1048 goto out; 1049 } 1050 1051 /* This is the backpressure mechanism for the Tx path. 1052 * Reserve space in the completion queue and only proceed 1053 * if there is space in it. This avoids having to implement 1054 * any buffering in the Tx path. 1055 */ 1056 err = xsk_cq_reserve_locked(xs->pool); 1057 if (err) { 1058 err = -EAGAIN; 1059 goto out; 1060 } 1061 1062 skb = xsk_build_skb(xs, &desc); 1063 if (IS_ERR(skb)) { 1064 err = PTR_ERR(skb); 1065 if (err != -EOVERFLOW) 1066 goto out; 1067 err = 0; 1068 continue; 1069 } 1070 1071 xskq_cons_release(xs->tx); 1072 1073 if (xp_mb_desc(&desc)) { 1074 xs->skb = skb; 1075 continue; 1076 } 1077 1078 err = __dev_direct_xmit(skb, xs->queue_id); 1079 if (err == NETDEV_TX_BUSY) { 1080 /* Tell user-space to retry the send */ 1081 xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); 1082 xsk_consume_skb(skb); 1083 err = -EAGAIN; 1084 goto out; 1085 } 1086 1087 /* Ignore NET_XMIT_CN as packet might have been sent */ 1088 if (err == NET_XMIT_DROP) { 1089 /* SKB completed but not sent */ 1090 err = -EBUSY; 1091 xs->skb = NULL; 1092 goto out; 1093 } 1094 1095 sent_frame = true; 1096 xs->skb = NULL; 1097 } 1098 1099 if (xskq_has_descs(xs->tx)) { 1100 if (xs->skb) 1101 xsk_drop_skb(xs->skb); 1102 xskq_cons_release(xs->tx); 1103 } 1104 1105 out: 1106 if (sent_frame) 1107 __xsk_tx_release(xs); 1108 1109 mutex_unlock(&xs->mutex); 1110 return err; 1111 } 1112 1113 static int xsk_generic_xmit(struct sock *sk) 1114 { 1115 int ret; 1116 1117 /* Drop the RCU lock since the SKB path might sleep. */ 1118 rcu_read_unlock(); 1119 ret = __xsk_generic_xmit(sk); 1120 /* Reaquire RCU lock before going into common code. */ 1121 rcu_read_lock(); 1122 1123 return ret; 1124 } 1125 1126 static bool xsk_no_wakeup(struct sock *sk) 1127 { 1128 #ifdef CONFIG_NET_RX_BUSY_POLL 1129 /* Prefer busy-polling, skip the wakeup. */ 1130 return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && 1131 napi_id_valid(READ_ONCE(sk->sk_napi_id)); 1132 #else 1133 return false; 1134 #endif 1135 } 1136 1137 static int xsk_check_common(struct xdp_sock *xs) 1138 { 1139 if (unlikely(!xsk_is_bound(xs))) 1140 return -ENXIO; 1141 if (unlikely(!(xs->dev->flags & IFF_UP))) 1142 return -ENETDOWN; 1143 1144 return 0; 1145 } 1146 1147 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1148 { 1149 bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 1150 struct sock *sk = sock->sk; 1151 struct xdp_sock *xs = xdp_sk(sk); 1152 struct xsk_buff_pool *pool; 1153 int err; 1154 1155 err = xsk_check_common(xs); 1156 if (err) 1157 return err; 1158 if (unlikely(need_wait)) 1159 return -EOPNOTSUPP; 1160 if (unlikely(!xs->tx)) 1161 return -ENOBUFS; 1162 1163 if (sk_can_busy_loop(sk)) 1164 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1165 1166 if (xs->zc && xsk_no_wakeup(sk)) 1167 return 0; 1168 1169 pool = xs->pool; 1170 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { 1171 if (xs->zc) 1172 return xsk_wakeup(xs, XDP_WAKEUP_TX); 1173 return xsk_generic_xmit(sk); 1174 } 1175 return 0; 1176 } 1177 1178 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1179 { 1180 int ret; 1181 1182 rcu_read_lock(); 1183 ret = __xsk_sendmsg(sock, m, total_len); 1184 rcu_read_unlock(); 1185 1186 return ret; 1187 } 1188 1189 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1190 { 1191 bool need_wait = !(flags & MSG_DONTWAIT); 1192 struct sock *sk = sock->sk; 1193 struct xdp_sock *xs = xdp_sk(sk); 1194 int err; 1195 1196 err = xsk_check_common(xs); 1197 if (err) 1198 return err; 1199 if (unlikely(!xs->rx)) 1200 return -ENOBUFS; 1201 if (unlikely(need_wait)) 1202 return -EOPNOTSUPP; 1203 1204 if (sk_can_busy_loop(sk)) 1205 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1206 1207 if (xsk_no_wakeup(sk)) 1208 return 0; 1209 1210 if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) 1211 return xsk_wakeup(xs, XDP_WAKEUP_RX); 1212 return 0; 1213 } 1214 1215 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1216 { 1217 int ret; 1218 1219 rcu_read_lock(); 1220 ret = __xsk_recvmsg(sock, m, len, flags); 1221 rcu_read_unlock(); 1222 1223 return ret; 1224 } 1225 1226 static __poll_t xsk_poll(struct file *file, struct socket *sock, 1227 struct poll_table_struct *wait) 1228 { 1229 __poll_t mask = 0; 1230 struct sock *sk = sock->sk; 1231 struct xdp_sock *xs = xdp_sk(sk); 1232 struct xsk_buff_pool *pool; 1233 1234 sock_poll_wait(file, sock, wait); 1235 1236 rcu_read_lock(); 1237 if (xsk_check_common(xs)) 1238 goto out; 1239 1240 pool = xs->pool; 1241 1242 if (pool->cached_need_wakeup) { 1243 if (xs->zc) 1244 xsk_wakeup(xs, pool->cached_need_wakeup); 1245 else if (xs->tx) 1246 /* Poll needs to drive Tx also in copy mode */ 1247 xsk_generic_xmit(sk); 1248 } 1249 1250 if (xs->rx && !xskq_prod_is_empty(xs->rx)) 1251 mask |= EPOLLIN | EPOLLRDNORM; 1252 if (xs->tx && xsk_tx_writeable(xs)) 1253 mask |= EPOLLOUT | EPOLLWRNORM; 1254 out: 1255 rcu_read_unlock(); 1256 return mask; 1257 } 1258 1259 static int xsk_init_queue(u32 entries, struct xsk_queue **queue, 1260 bool umem_queue) 1261 { 1262 struct xsk_queue *q; 1263 1264 if (entries == 0 || *queue || !is_power_of_2(entries)) 1265 return -EINVAL; 1266 1267 q = xskq_create(entries, umem_queue); 1268 if (!q) 1269 return -ENOMEM; 1270 1271 /* Make sure queue is ready before it can be seen by others */ 1272 smp_wmb(); 1273 WRITE_ONCE(*queue, q); 1274 return 0; 1275 } 1276 1277 static void xsk_unbind_dev(struct xdp_sock *xs) 1278 { 1279 struct net_device *dev = xs->dev; 1280 1281 if (xs->state != XSK_BOUND) 1282 return; 1283 WRITE_ONCE(xs->state, XSK_UNBOUND); 1284 1285 /* Wait for driver to stop using the xdp socket. */ 1286 xp_del_xsk(xs->pool, xs); 1287 synchronize_net(); 1288 dev_put(dev); 1289 } 1290 1291 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, 1292 struct xdp_sock __rcu ***map_entry) 1293 { 1294 struct xsk_map *map = NULL; 1295 struct xsk_map_node *node; 1296 1297 *map_entry = NULL; 1298 1299 spin_lock_bh(&xs->map_list_lock); 1300 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 1301 node); 1302 if (node) { 1303 bpf_map_inc(&node->map->map); 1304 map = node->map; 1305 *map_entry = node->map_entry; 1306 } 1307 spin_unlock_bh(&xs->map_list_lock); 1308 return map; 1309 } 1310 1311 static void xsk_delete_from_maps(struct xdp_sock *xs) 1312 { 1313 /* This function removes the current XDP socket from all the 1314 * maps it resides in. We need to take extra care here, due to 1315 * the two locks involved. Each map has a lock synchronizing 1316 * updates to the entries, and each socket has a lock that 1317 * synchronizes access to the list of maps (map_list). For 1318 * deadlock avoidance the locks need to be taken in the order 1319 * "map lock"->"socket map list lock". We start off by 1320 * accessing the socket map list, and take a reference to the 1321 * map to guarantee existence between the 1322 * xsk_get_map_list_entry() and xsk_map_try_sock_delete() 1323 * calls. Then we ask the map to remove the socket, which 1324 * tries to remove the socket from the map. Note that there 1325 * might be updates to the map between 1326 * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 1327 */ 1328 struct xdp_sock __rcu **map_entry = NULL; 1329 struct xsk_map *map; 1330 1331 while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 1332 xsk_map_try_sock_delete(map, xs, map_entry); 1333 bpf_map_put(&map->map); 1334 } 1335 } 1336 1337 static int xsk_release(struct socket *sock) 1338 { 1339 struct sock *sk = sock->sk; 1340 struct xdp_sock *xs = xdp_sk(sk); 1341 struct net *net; 1342 1343 if (!sk) 1344 return 0; 1345 1346 net = sock_net(sk); 1347 1348 if (xs->skb) 1349 xsk_drop_skb(xs->skb); 1350 1351 mutex_lock(&net->xdp.lock); 1352 sk_del_node_init_rcu(sk); 1353 mutex_unlock(&net->xdp.lock); 1354 1355 sock_prot_inuse_add(net, sk->sk_prot, -1); 1356 1357 xsk_delete_from_maps(xs); 1358 mutex_lock(&xs->mutex); 1359 xsk_unbind_dev(xs); 1360 mutex_unlock(&xs->mutex); 1361 1362 xskq_destroy(xs->rx); 1363 xskq_destroy(xs->tx); 1364 xskq_destroy(xs->fq_tmp); 1365 xskq_destroy(xs->cq_tmp); 1366 1367 sock_orphan(sk); 1368 sock->sk = NULL; 1369 1370 sock_put(sk); 1371 1372 return 0; 1373 } 1374 1375 static struct socket *xsk_lookup_xsk_from_fd(int fd) 1376 { 1377 struct socket *sock; 1378 int err; 1379 1380 sock = sockfd_lookup(fd, &err); 1381 if (!sock) 1382 return ERR_PTR(-ENOTSOCK); 1383 1384 if (sock->sk->sk_family != PF_XDP) { 1385 sockfd_put(sock); 1386 return ERR_PTR(-ENOPROTOOPT); 1387 } 1388 1389 return sock; 1390 } 1391 1392 static bool xsk_validate_queues(struct xdp_sock *xs) 1393 { 1394 return xs->fq_tmp && xs->cq_tmp; 1395 } 1396 1397 static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) 1398 { 1399 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 1400 struct sock *sk = sock->sk; 1401 struct xdp_sock *xs = xdp_sk(sk); 1402 struct net_device *dev; 1403 int bound_dev_if; 1404 u32 flags, qid; 1405 int err = 0; 1406 1407 if (addr_len < sizeof(struct sockaddr_xdp)) 1408 return -EINVAL; 1409 if (sxdp->sxdp_family != AF_XDP) 1410 return -EINVAL; 1411 1412 flags = sxdp->sxdp_flags; 1413 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | 1414 XDP_USE_NEED_WAKEUP | XDP_USE_SG)) 1415 return -EINVAL; 1416 1417 bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 1418 if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) 1419 return -EINVAL; 1420 1421 rtnl_lock(); 1422 mutex_lock(&xs->mutex); 1423 if (xs->state != XSK_READY) { 1424 err = -EBUSY; 1425 goto out_release; 1426 } 1427 1428 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 1429 if (!dev) { 1430 err = -ENODEV; 1431 goto out_release; 1432 } 1433 1434 netdev_lock_ops(dev); 1435 1436 if (!xs->rx && !xs->tx) { 1437 err = -EINVAL; 1438 goto out_unlock; 1439 } 1440 1441 qid = sxdp->sxdp_queue_id; 1442 1443 if (flags & XDP_SHARED_UMEM) { 1444 struct xdp_sock *umem_xs; 1445 struct socket *sock; 1446 1447 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || 1448 (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { 1449 /* Cannot specify flags for shared sockets. */ 1450 err = -EINVAL; 1451 goto out_unlock; 1452 } 1453 1454 if (xs->umem) { 1455 /* We have already our own. */ 1456 err = -EINVAL; 1457 goto out_unlock; 1458 } 1459 1460 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); 1461 if (IS_ERR(sock)) { 1462 err = PTR_ERR(sock); 1463 goto out_unlock; 1464 } 1465 1466 umem_xs = xdp_sk(sock->sk); 1467 if (!xsk_is_bound(umem_xs)) { 1468 err = -EBADF; 1469 sockfd_put(sock); 1470 goto out_unlock; 1471 } 1472 1473 if (umem_xs->queue_id != qid || umem_xs->dev != dev) { 1474 /* One fill and completion ring required for each queue id. */ 1475 if (!xsk_validate_queues(xs)) { 1476 err = -EINVAL; 1477 sockfd_put(sock); 1478 goto out_unlock; 1479 } 1480 1481 /* Share the umem with another socket on another qid 1482 * and/or device. 1483 */ 1484 xs->pool = xp_create_and_assign_umem(xs, 1485 umem_xs->umem); 1486 if (!xs->pool) { 1487 err = -ENOMEM; 1488 sockfd_put(sock); 1489 goto out_unlock; 1490 } 1491 1492 err = xp_assign_dev_shared(xs->pool, umem_xs, dev, 1493 qid); 1494 if (err) { 1495 xp_destroy(xs->pool); 1496 xs->pool = NULL; 1497 sockfd_put(sock); 1498 goto out_unlock; 1499 } 1500 } else { 1501 /* Share the buffer pool with the other socket. */ 1502 if (xs->fq_tmp || xs->cq_tmp) { 1503 /* Do not allow setting your own fq or cq. */ 1504 err = -EINVAL; 1505 sockfd_put(sock); 1506 goto out_unlock; 1507 } 1508 1509 xp_get_pool(umem_xs->pool); 1510 xs->pool = umem_xs->pool; 1511 1512 /* If underlying shared umem was created without Tx 1513 * ring, allocate Tx descs array that Tx batching API 1514 * utilizes 1515 */ 1516 if (xs->tx && !xs->pool->tx_descs) { 1517 err = xp_alloc_tx_descs(xs->pool, xs); 1518 if (err) { 1519 xp_put_pool(xs->pool); 1520 xs->pool = NULL; 1521 sockfd_put(sock); 1522 goto out_unlock; 1523 } 1524 } 1525 } 1526 1527 xdp_get_umem(umem_xs->umem); 1528 WRITE_ONCE(xs->umem, umem_xs->umem); 1529 sockfd_put(sock); 1530 } else if (!xs->umem || !xsk_validate_queues(xs)) { 1531 err = -EINVAL; 1532 goto out_unlock; 1533 } else { 1534 /* This xsk has its own umem. */ 1535 xs->pool = xp_create_and_assign_umem(xs, xs->umem); 1536 if (!xs->pool) { 1537 err = -ENOMEM; 1538 goto out_unlock; 1539 } 1540 1541 err = xp_assign_dev(xs->pool, dev, qid, flags); 1542 if (err) { 1543 xp_destroy(xs->pool); 1544 xs->pool = NULL; 1545 goto out_unlock; 1546 } 1547 } 1548 1549 /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ 1550 xs->fq_tmp = NULL; 1551 xs->cq_tmp = NULL; 1552 1553 xs->dev = dev; 1554 xs->zc = xs->umem->zc; 1555 xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); 1556 xs->queue_id = qid; 1557 xp_add_xsk(xs->pool, xs); 1558 1559 if (qid < dev->real_num_rx_queues) { 1560 struct netdev_rx_queue *rxq; 1561 1562 rxq = __netif_get_rx_queue(dev, qid); 1563 if (rxq->napi) 1564 __sk_mark_napi_id_once(sk, rxq->napi->napi_id); 1565 } 1566 1567 out_unlock: 1568 if (err) { 1569 dev_put(dev); 1570 } else { 1571 /* Matches smp_rmb() in bind() for shared umem 1572 * sockets, and xsk_is_bound(). 1573 */ 1574 smp_wmb(); 1575 WRITE_ONCE(xs->state, XSK_BOUND); 1576 } 1577 netdev_unlock_ops(dev); 1578 out_release: 1579 mutex_unlock(&xs->mutex); 1580 rtnl_unlock(); 1581 return err; 1582 } 1583 1584 struct xdp_umem_reg_v1 { 1585 __u64 addr; /* Start of packet data area */ 1586 __u64 len; /* Length of packet data area */ 1587 __u32 chunk_size; 1588 __u32 headroom; 1589 }; 1590 1591 static int xsk_setsockopt(struct socket *sock, int level, int optname, 1592 sockptr_t optval, unsigned int optlen) 1593 { 1594 struct sock *sk = sock->sk; 1595 struct xdp_sock *xs = xdp_sk(sk); 1596 int err; 1597 1598 if (level != SOL_XDP) 1599 return -ENOPROTOOPT; 1600 1601 switch (optname) { 1602 case XDP_RX_RING: 1603 case XDP_TX_RING: 1604 { 1605 struct xsk_queue **q; 1606 int entries; 1607 1608 if (optlen < sizeof(entries)) 1609 return -EINVAL; 1610 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1611 return -EFAULT; 1612 1613 mutex_lock(&xs->mutex); 1614 if (xs->state != XSK_READY) { 1615 mutex_unlock(&xs->mutex); 1616 return -EBUSY; 1617 } 1618 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; 1619 err = xsk_init_queue(entries, q, false); 1620 if (!err && optname == XDP_TX_RING) 1621 /* Tx needs to be explicitly woken up the first time */ 1622 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 1623 mutex_unlock(&xs->mutex); 1624 return err; 1625 } 1626 case XDP_UMEM_REG: 1627 { 1628 size_t mr_size = sizeof(struct xdp_umem_reg); 1629 struct xdp_umem_reg mr = {}; 1630 struct xdp_umem *umem; 1631 1632 if (optlen < sizeof(struct xdp_umem_reg_v1)) 1633 return -EINVAL; 1634 else if (optlen < sizeof(mr)) 1635 mr_size = sizeof(struct xdp_umem_reg_v1); 1636 1637 BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); 1638 1639 /* Make sure the last field of the struct doesn't have 1640 * uninitialized padding. All padding has to be explicit 1641 * and has to be set to zero by the userspace to make 1642 * struct xdp_umem_reg extensible in the future. 1643 */ 1644 BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + 1645 sizeof_field(struct xdp_umem_reg, tx_metadata_len) != 1646 sizeof(struct xdp_umem_reg)); 1647 1648 if (copy_from_sockptr(&mr, optval, mr_size)) 1649 return -EFAULT; 1650 1651 mutex_lock(&xs->mutex); 1652 if (xs->state != XSK_READY || xs->umem) { 1653 mutex_unlock(&xs->mutex); 1654 return -EBUSY; 1655 } 1656 1657 umem = xdp_umem_create(&mr); 1658 if (IS_ERR(umem)) { 1659 mutex_unlock(&xs->mutex); 1660 return PTR_ERR(umem); 1661 } 1662 1663 /* Make sure umem is ready before it can be seen by others */ 1664 smp_wmb(); 1665 WRITE_ONCE(xs->umem, umem); 1666 mutex_unlock(&xs->mutex); 1667 return 0; 1668 } 1669 case XDP_UMEM_FILL_RING: 1670 case XDP_UMEM_COMPLETION_RING: 1671 { 1672 struct xsk_queue **q; 1673 int entries; 1674 1675 if (optlen < sizeof(entries)) 1676 return -EINVAL; 1677 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1678 return -EFAULT; 1679 1680 mutex_lock(&xs->mutex); 1681 if (xs->state != XSK_READY) { 1682 mutex_unlock(&xs->mutex); 1683 return -EBUSY; 1684 } 1685 1686 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : 1687 &xs->cq_tmp; 1688 err = xsk_init_queue(entries, q, true); 1689 mutex_unlock(&xs->mutex); 1690 return err; 1691 } 1692 case XDP_MAX_TX_SKB_BUDGET: 1693 { 1694 unsigned int budget; 1695 1696 if (optlen != sizeof(budget)) 1697 return -EINVAL; 1698 if (copy_from_sockptr(&budget, optval, sizeof(budget))) 1699 return -EFAULT; 1700 if (!xs->tx || 1701 budget < TX_BATCH_SIZE || budget > xs->tx->nentries) 1702 return -EACCES; 1703 1704 WRITE_ONCE(xs->max_tx_budget, budget); 1705 return 0; 1706 } 1707 default: 1708 break; 1709 } 1710 1711 return -ENOPROTOOPT; 1712 } 1713 1714 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) 1715 { 1716 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 1717 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 1718 ring->desc = offsetof(struct xdp_rxtx_ring, desc); 1719 } 1720 1721 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) 1722 { 1723 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); 1724 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 1725 ring->desc = offsetof(struct xdp_umem_ring, desc); 1726 } 1727 1728 struct xdp_statistics_v1 { 1729 __u64 rx_dropped; 1730 __u64 rx_invalid_descs; 1731 __u64 tx_invalid_descs; 1732 }; 1733 1734 static int xsk_getsockopt(struct socket *sock, int level, int optname, 1735 char __user *optval, int __user *optlen) 1736 { 1737 struct sock *sk = sock->sk; 1738 struct xdp_sock *xs = xdp_sk(sk); 1739 int len; 1740 1741 if (level != SOL_XDP) 1742 return -ENOPROTOOPT; 1743 1744 if (get_user(len, optlen)) 1745 return -EFAULT; 1746 if (len < 0) 1747 return -EINVAL; 1748 1749 switch (optname) { 1750 case XDP_STATISTICS: 1751 { 1752 struct xdp_statistics stats = {}; 1753 bool extra_stats = true; 1754 size_t stats_size; 1755 1756 if (len < sizeof(struct xdp_statistics_v1)) { 1757 return -EINVAL; 1758 } else if (len < sizeof(stats)) { 1759 extra_stats = false; 1760 stats_size = sizeof(struct xdp_statistics_v1); 1761 } else { 1762 stats_size = sizeof(stats); 1763 } 1764 1765 mutex_lock(&xs->mutex); 1766 stats.rx_dropped = xs->rx_dropped; 1767 if (extra_stats) { 1768 stats.rx_ring_full = xs->rx_queue_full; 1769 stats.rx_fill_ring_empty_descs = 1770 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; 1771 stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); 1772 } else { 1773 stats.rx_dropped += xs->rx_queue_full; 1774 } 1775 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); 1776 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); 1777 mutex_unlock(&xs->mutex); 1778 1779 if (copy_to_user(optval, &stats, stats_size)) 1780 return -EFAULT; 1781 if (put_user(stats_size, optlen)) 1782 return -EFAULT; 1783 1784 return 0; 1785 } 1786 case XDP_MMAP_OFFSETS: 1787 { 1788 struct xdp_mmap_offsets off; 1789 struct xdp_mmap_offsets_v1 off_v1; 1790 bool flags_supported = true; 1791 void *to_copy; 1792 1793 if (len < sizeof(off_v1)) 1794 return -EINVAL; 1795 else if (len < sizeof(off)) 1796 flags_supported = false; 1797 1798 if (flags_supported) { 1799 /* xdp_ring_offset is identical to xdp_ring_offset_v1 1800 * except for the flags field added to the end. 1801 */ 1802 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1803 &off.rx); 1804 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1805 &off.tx); 1806 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1807 &off.fr); 1808 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1809 &off.cr); 1810 off.rx.flags = offsetof(struct xdp_rxtx_ring, 1811 ptrs.flags); 1812 off.tx.flags = offsetof(struct xdp_rxtx_ring, 1813 ptrs.flags); 1814 off.fr.flags = offsetof(struct xdp_umem_ring, 1815 ptrs.flags); 1816 off.cr.flags = offsetof(struct xdp_umem_ring, 1817 ptrs.flags); 1818 1819 len = sizeof(off); 1820 to_copy = &off; 1821 } else { 1822 xsk_enter_rxtx_offsets(&off_v1.rx); 1823 xsk_enter_rxtx_offsets(&off_v1.tx); 1824 xsk_enter_umem_offsets(&off_v1.fr); 1825 xsk_enter_umem_offsets(&off_v1.cr); 1826 1827 len = sizeof(off_v1); 1828 to_copy = &off_v1; 1829 } 1830 1831 if (copy_to_user(optval, to_copy, len)) 1832 return -EFAULT; 1833 if (put_user(len, optlen)) 1834 return -EFAULT; 1835 1836 return 0; 1837 } 1838 case XDP_OPTIONS: 1839 { 1840 struct xdp_options opts = {}; 1841 1842 if (len < sizeof(opts)) 1843 return -EINVAL; 1844 1845 mutex_lock(&xs->mutex); 1846 if (xs->zc) 1847 opts.flags |= XDP_OPTIONS_ZEROCOPY; 1848 mutex_unlock(&xs->mutex); 1849 1850 len = sizeof(opts); 1851 if (copy_to_user(optval, &opts, len)) 1852 return -EFAULT; 1853 if (put_user(len, optlen)) 1854 return -EFAULT; 1855 1856 return 0; 1857 } 1858 default: 1859 break; 1860 } 1861 1862 return -EOPNOTSUPP; 1863 } 1864 1865 static int xsk_mmap(struct file *file, struct socket *sock, 1866 struct vm_area_struct *vma) 1867 { 1868 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 1869 unsigned long size = vma->vm_end - vma->vm_start; 1870 struct xdp_sock *xs = xdp_sk(sock->sk); 1871 int state = READ_ONCE(xs->state); 1872 struct xsk_queue *q = NULL; 1873 1874 if (state != XSK_READY && state != XSK_BOUND) 1875 return -EBUSY; 1876 1877 if (offset == XDP_PGOFF_RX_RING) { 1878 q = READ_ONCE(xs->rx); 1879 } else if (offset == XDP_PGOFF_TX_RING) { 1880 q = READ_ONCE(xs->tx); 1881 } else { 1882 /* Matches the smp_wmb() in XDP_UMEM_REG */ 1883 smp_rmb(); 1884 if (offset == XDP_UMEM_PGOFF_FILL_RING) 1885 q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : 1886 READ_ONCE(xs->pool->fq); 1887 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 1888 q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : 1889 READ_ONCE(xs->pool->cq); 1890 } 1891 1892 if (!q) 1893 return -EINVAL; 1894 1895 /* Matches the smp_wmb() in xsk_init_queue */ 1896 smp_rmb(); 1897 if (size > q->ring_vmalloc_size) 1898 return -EINVAL; 1899 1900 return remap_vmalloc_range(vma, q->ring, 0); 1901 } 1902 1903 static int xsk_notifier(struct notifier_block *this, 1904 unsigned long msg, void *ptr) 1905 { 1906 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1907 struct net *net = dev_net(dev); 1908 struct sock *sk; 1909 1910 switch (msg) { 1911 case NETDEV_UNREGISTER: 1912 mutex_lock(&net->xdp.lock); 1913 sk_for_each(sk, &net->xdp.list) { 1914 struct xdp_sock *xs = xdp_sk(sk); 1915 1916 mutex_lock(&xs->mutex); 1917 if (xs->dev == dev) { 1918 sk->sk_err = ENETDOWN; 1919 if (!sock_flag(sk, SOCK_DEAD)) 1920 sk_error_report(sk); 1921 1922 xsk_unbind_dev(xs); 1923 1924 /* Clear device references. */ 1925 xp_clear_dev(xs->pool); 1926 } 1927 mutex_unlock(&xs->mutex); 1928 } 1929 mutex_unlock(&net->xdp.lock); 1930 break; 1931 } 1932 return NOTIFY_DONE; 1933 } 1934 1935 static struct proto xsk_proto = { 1936 .name = "XDP", 1937 .owner = THIS_MODULE, 1938 .obj_size = sizeof(struct xdp_sock), 1939 }; 1940 1941 static const struct proto_ops xsk_proto_ops = { 1942 .family = PF_XDP, 1943 .owner = THIS_MODULE, 1944 .release = xsk_release, 1945 .bind = xsk_bind, 1946 .connect = sock_no_connect, 1947 .socketpair = sock_no_socketpair, 1948 .accept = sock_no_accept, 1949 .getname = sock_no_getname, 1950 .poll = xsk_poll, 1951 .ioctl = sock_no_ioctl, 1952 .listen = sock_no_listen, 1953 .shutdown = sock_no_shutdown, 1954 .setsockopt = xsk_setsockopt, 1955 .getsockopt = xsk_getsockopt, 1956 .sendmsg = xsk_sendmsg, 1957 .recvmsg = xsk_recvmsg, 1958 .mmap = xsk_mmap, 1959 }; 1960 1961 static void xsk_destruct(struct sock *sk) 1962 { 1963 struct xdp_sock *xs = xdp_sk(sk); 1964 1965 if (!sock_flag(sk, SOCK_DEAD)) 1966 return; 1967 1968 if (!xp_put_pool(xs->pool)) 1969 xdp_put_umem(xs->umem, !xs->pool); 1970 } 1971 1972 static int xsk_create(struct net *net, struct socket *sock, int protocol, 1973 int kern) 1974 { 1975 struct xdp_sock *xs; 1976 struct sock *sk; 1977 1978 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 1979 return -EPERM; 1980 if (sock->type != SOCK_RAW) 1981 return -ESOCKTNOSUPPORT; 1982 1983 if (protocol) 1984 return -EPROTONOSUPPORT; 1985 1986 sock->state = SS_UNCONNECTED; 1987 1988 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); 1989 if (!sk) 1990 return -ENOBUFS; 1991 1992 sock->ops = &xsk_proto_ops; 1993 1994 sock_init_data(sock, sk); 1995 1996 sk->sk_family = PF_XDP; 1997 1998 sk->sk_destruct = xsk_destruct; 1999 2000 sock_set_flag(sk, SOCK_RCU_FREE); 2001 2002 xs = xdp_sk(sk); 2003 xs->state = XSK_READY; 2004 xs->max_tx_budget = TX_BATCH_SIZE; 2005 mutex_init(&xs->mutex); 2006 2007 INIT_LIST_HEAD(&xs->map_list); 2008 spin_lock_init(&xs->map_list_lock); 2009 2010 mutex_lock(&net->xdp.lock); 2011 sk_add_node_rcu(sk, &net->xdp.list); 2012 mutex_unlock(&net->xdp.lock); 2013 2014 sock_prot_inuse_add(net, &xsk_proto, 1); 2015 2016 return 0; 2017 } 2018 2019 static const struct net_proto_family xsk_family_ops = { 2020 .family = PF_XDP, 2021 .create = xsk_create, 2022 .owner = THIS_MODULE, 2023 }; 2024 2025 static struct notifier_block xsk_netdev_notifier = { 2026 .notifier_call = xsk_notifier, 2027 }; 2028 2029 static int __net_init xsk_net_init(struct net *net) 2030 { 2031 mutex_init(&net->xdp.lock); 2032 INIT_HLIST_HEAD(&net->xdp.list); 2033 return 0; 2034 } 2035 2036 static void __net_exit xsk_net_exit(struct net *net) 2037 { 2038 WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); 2039 } 2040 2041 static struct pernet_operations xsk_net_ops = { 2042 .init = xsk_net_init, 2043 .exit = xsk_net_exit, 2044 }; 2045 2046 static int __init xsk_init(void) 2047 { 2048 int err; 2049 2050 err = proto_register(&xsk_proto, 0 /* no slab */); 2051 if (err) 2052 goto out; 2053 2054 err = sock_register(&xsk_family_ops); 2055 if (err) 2056 goto out_proto; 2057 2058 err = register_pernet_subsys(&xsk_net_ops); 2059 if (err) 2060 goto out_sk; 2061 2062 err = register_netdevice_notifier(&xsk_netdev_notifier); 2063 if (err) 2064 goto out_pernet; 2065 2066 xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", 2067 sizeof(struct xsk_addrs), 2068 0, SLAB_HWCACHE_ALIGN, NULL); 2069 if (!xsk_tx_generic_cache) { 2070 err = -ENOMEM; 2071 goto out_unreg_notif; 2072 } 2073 2074 return 0; 2075 2076 out_unreg_notif: 2077 unregister_netdevice_notifier(&xsk_netdev_notifier); 2078 out_pernet: 2079 unregister_pernet_subsys(&xsk_net_ops); 2080 out_sk: 2081 sock_unregister(PF_XDP); 2082 out_proto: 2083 proto_unregister(&xsk_proto); 2084 out: 2085 return err; 2086 } 2087 2088 fs_initcall(xsk_init); 2089