1 // SPDX-License-Identifier: GPL-2.0 2 /* XDP sockets 3 * 4 * AF_XDP sockets allows a channel between XDP programs and userspace 5 * applications. 6 * Copyright(c) 2018 Intel Corporation. 7 * 8 * Author(s): Björn Töpel <bjorn.topel@intel.com> 9 * Magnus Karlsson <magnus.karlsson@intel.com> 10 */ 11 12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ 13 14 #include <linux/if_xdp.h> 15 #include <linux/init.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/sched/task.h> 19 #include <linux/socket.h> 20 #include <linux/file.h> 21 #include <linux/uaccess.h> 22 #include <linux/net.h> 23 #include <linux/netdevice.h> 24 #include <linux/rculist.h> 25 #include <linux/vmalloc.h> 26 27 #include <net/netdev_queues.h> 28 #include <net/xdp_sock_drv.h> 29 #include <net/busy_poll.h> 30 #include <net/netdev_lock.h> 31 #include <net/netdev_rx_queue.h> 32 #include <net/xdp.h> 33 34 #include "../core/dev.h" 35 36 #include "xsk_queue.h" 37 #include "xdp_umem.h" 38 #include "xsk.h" 39 40 #define TX_BATCH_SIZE 32 41 #define MAX_PER_SOCKET_BUDGET 32 42 43 struct xsk_addrs { 44 u32 num_descs; 45 u64 addrs[MAX_SKB_FRAGS + 1]; 46 }; 47 48 static struct kmem_cache *xsk_tx_generic_cache; 49 50 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) 51 { 52 if (pool->cached_need_wakeup & XDP_WAKEUP_RX) 53 return; 54 55 pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; 56 pool->cached_need_wakeup |= XDP_WAKEUP_RX; 57 } 58 EXPORT_SYMBOL(xsk_set_rx_need_wakeup); 59 60 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) 61 { 62 struct xdp_sock *xs; 63 64 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) 65 return; 66 67 rcu_read_lock(); 68 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 69 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 70 } 71 rcu_read_unlock(); 72 73 pool->cached_need_wakeup |= XDP_WAKEUP_TX; 74 } 75 EXPORT_SYMBOL(xsk_set_tx_need_wakeup); 76 77 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) 78 { 79 if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) 80 return; 81 82 pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; 83 pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; 84 } 85 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); 86 87 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) 88 { 89 struct xdp_sock *xs; 90 91 if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) 92 return; 93 94 rcu_read_lock(); 95 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 96 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 97 } 98 rcu_read_unlock(); 99 100 pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; 101 } 102 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); 103 104 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) 105 { 106 return pool->uses_need_wakeup; 107 } 108 EXPORT_SYMBOL(xsk_uses_need_wakeup); 109 110 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 111 u16 queue_id) 112 { 113 if (queue_id < dev->real_num_rx_queues) 114 return dev->_rx[queue_id].pool; 115 if (queue_id < dev->real_num_tx_queues) 116 return dev->_tx[queue_id].pool; 117 118 return NULL; 119 } 120 EXPORT_SYMBOL(xsk_get_pool_from_qid); 121 122 static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 123 { 124 if (queue_id < dev->num_rx_queues) 125 dev->_rx[queue_id].pool = NULL; 126 if (queue_id < dev->num_tx_queues) 127 dev->_tx[queue_id].pool = NULL; 128 } 129 130 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 131 { 132 struct netdev_rx_queue *hw_rxq; 133 134 if (!netif_rxq_is_leased(dev, queue_id)) 135 return __xsk_clear_pool_at_qid(dev, queue_id); 136 WARN_ON_ONCE(!netif_is_queue_leasee(dev)); 137 138 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 139 140 netdev_lock(hw_rxq->dev); 141 queue_id = get_netdev_rx_queue_index(hw_rxq); 142 __xsk_clear_pool_at_qid(hw_rxq->dev, queue_id); 143 netdev_unlock(hw_rxq->dev); 144 } 145 146 static int __xsk_reg_pool_at_qid(struct net_device *dev, 147 struct xsk_buff_pool *pool, u16 queue_id) 148 { 149 if (xsk_get_pool_from_qid(dev, queue_id)) 150 return -EBUSY; 151 152 if (queue_id < dev->real_num_rx_queues) 153 dev->_rx[queue_id].pool = pool; 154 if (queue_id < dev->real_num_tx_queues) 155 dev->_tx[queue_id].pool = pool; 156 157 return 0; 158 } 159 160 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do 161 * not know if the device has more tx queues than rx, or the opposite. 162 * This might also change during run time. 163 */ 164 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 165 u16 queue_id) 166 { 167 struct netdev_rx_queue *hw_rxq; 168 int ret; 169 170 if (queue_id >= max(dev->real_num_rx_queues, 171 dev->real_num_tx_queues)) 172 return -EINVAL; 173 174 if (queue_id >= dev->real_num_rx_queues || 175 !netif_rxq_is_leased(dev, queue_id)) 176 return __xsk_reg_pool_at_qid(dev, pool, queue_id); 177 if (!netif_is_queue_leasee(dev)) 178 return -EBUSY; 179 180 hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; 181 182 netdev_lock(hw_rxq->dev); 183 queue_id = get_netdev_rx_queue_index(hw_rxq); 184 ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id); 185 netdev_unlock(hw_rxq->dev); 186 187 return ret; 188 } 189 190 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, 191 u32 flags) 192 { 193 u64 addr; 194 int err; 195 196 addr = xp_get_handle(xskb, xskb->pool); 197 err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); 198 if (err) { 199 xs->rx_queue_full++; 200 return err; 201 } 202 203 xp_release(xskb); 204 return 0; 205 } 206 207 static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, 208 u32 len, u32 flags) 209 { 210 u64 addr; 211 212 addr = xp_get_handle(xskb, xskb->pool); 213 __xskq_prod_reserve_desc(xs->rx, addr, len, flags); 214 215 xp_release(xskb); 216 } 217 218 static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 219 { 220 struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); 221 u32 frags = xdp_buff_has_frags(xdp); 222 struct xdp_buff_xsk *pos, *tmp; 223 struct list_head *xskb_list; 224 u32 contd = 0; 225 u32 num_desc; 226 int err; 227 228 if (likely(!frags)) { 229 err = __xsk_rcv_zc(xs, xskb, len, contd); 230 if (err) 231 goto err; 232 return 0; 233 } 234 235 contd = XDP_PKT_CONTD; 236 num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; 237 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 238 xs->rx_queue_full++; 239 err = -ENOBUFS; 240 goto err; 241 } 242 243 __xsk_rcv_zc_safe(xs, xskb, len, contd); 244 xskb_list = &xskb->pool->xskb_list; 245 list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { 246 if (list_is_singular(xskb_list)) 247 contd = 0; 248 len = pos->xdp.data_end - pos->xdp.data; 249 __xsk_rcv_zc_safe(xs, pos, len, contd); 250 list_del_init(&pos->list_node); 251 } 252 253 return 0; 254 err: 255 xsk_buff_free(xdp); 256 return err; 257 } 258 259 static void *xsk_copy_xdp_start(struct xdp_buff *from) 260 { 261 if (unlikely(xdp_data_meta_unsupported(from))) 262 return from->data; 263 else 264 return from->data_meta; 265 } 266 267 static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, 268 u32 *from_len, skb_frag_t **frag, u32 rem) 269 { 270 u32 copied = 0; 271 272 while (1) { 273 u32 copy_len = min_t(u32, *from_len, to_len); 274 275 memcpy(to, *from, copy_len); 276 copied += copy_len; 277 if (rem == copied) 278 return copied; 279 280 if (*from_len == copy_len) { 281 *from = skb_frag_address(*frag); 282 *from_len = skb_frag_size((*frag)++); 283 } else { 284 *from += copy_len; 285 *from_len -= copy_len; 286 } 287 if (to_len == copy_len) 288 return copied; 289 290 to_len -= copy_len; 291 to += copy_len; 292 } 293 } 294 295 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 296 { 297 u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); 298 void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; 299 u32 from_len, meta_len, rem, num_desc; 300 struct xdp_buff_xsk *xskb; 301 struct xdp_buff *xsk_xdp; 302 skb_frag_t *frag; 303 304 from_len = xdp->data_end - copy_from; 305 meta_len = xdp->data - copy_from; 306 rem = len + meta_len; 307 308 if (len <= frame_size && !xdp_buff_has_frags(xdp)) { 309 int err; 310 311 xsk_xdp = xsk_buff_alloc(xs->pool); 312 if (!xsk_xdp) { 313 xs->rx_dropped++; 314 return -ENOMEM; 315 } 316 memcpy(xsk_xdp->data - meta_len, copy_from, rem); 317 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 318 err = __xsk_rcv_zc(xs, xskb, len, 0); 319 if (err) { 320 xsk_buff_free(xsk_xdp); 321 return err; 322 } 323 324 return 0; 325 } 326 327 num_desc = (len - 1) / frame_size + 1; 328 329 if (!xsk_buff_can_alloc(xs->pool, num_desc)) { 330 xs->rx_dropped++; 331 return -ENOMEM; 332 } 333 if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 334 xs->rx_queue_full++; 335 return -ENOBUFS; 336 } 337 338 if (xdp_buff_has_frags(xdp)) { 339 struct skb_shared_info *sinfo; 340 341 sinfo = xdp_get_shared_info_from_buff(xdp); 342 frag = &sinfo->frags[0]; 343 } 344 345 do { 346 u32 to_len = frame_size + meta_len; 347 u32 copied; 348 349 xsk_xdp = xsk_buff_alloc(xs->pool); 350 copy_to = xsk_xdp->data - meta_len; 351 352 copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); 353 rem -= copied; 354 355 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 356 __xsk_rcv_zc_safe(xs, xskb, copied - meta_len, 357 rem ? XDP_PKT_CONTD : 0); 358 meta_len = 0; 359 } while (rem); 360 361 return 0; 362 } 363 364 static bool xsk_tx_writeable(struct xdp_sock *xs) 365 { 366 if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) 367 return false; 368 369 return true; 370 } 371 372 static void __xsk_tx_release(struct xdp_sock *xs) 373 { 374 __xskq_cons_release(xs->tx); 375 if (xsk_tx_writeable(xs)) 376 xs->sk.sk_write_space(&xs->sk); 377 } 378 379 static bool xsk_is_bound(struct xdp_sock *xs) 380 { 381 if (READ_ONCE(xs->state) == XSK_BOUND) { 382 /* Matches smp_wmb() in bind(). */ 383 smp_rmb(); 384 return true; 385 } 386 return false; 387 } 388 389 static bool xsk_dev_queue_valid(const struct xdp_sock *xs, 390 const struct xdp_rxq_info *info) 391 { 392 struct net_device *dev = xs->dev; 393 u32 queue_index = xs->queue_id; 394 struct netdev_rx_queue *rxq; 395 396 if (info->dev == dev && 397 info->queue_index == queue_index) 398 return true; 399 400 if (queue_index < dev->real_num_rx_queues) { 401 rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); 402 if (!rxq) 403 return false; 404 405 dev = rxq->dev; 406 queue_index = get_netdev_rx_queue_index(rxq); 407 408 return info->dev == dev && 409 info->queue_index == queue_index; 410 } 411 return false; 412 } 413 414 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 415 { 416 if (!xsk_is_bound(xs)) 417 return -ENXIO; 418 if (!xsk_dev_queue_valid(xs, xdp->rxq)) 419 return -EINVAL; 420 421 if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { 422 xs->rx_dropped++; 423 return -ENOSPC; 424 } 425 426 return 0; 427 } 428 429 static void xsk_flush(struct xdp_sock *xs) 430 { 431 xskq_prod_submit(xs->rx); 432 __xskq_cons_release(xs->pool->fq); 433 sock_def_readable(&xs->sk); 434 } 435 436 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 437 { 438 u32 len = xdp_get_buff_len(xdp); 439 int err; 440 441 err = xsk_rcv_check(xs, xdp, len); 442 if (!err) { 443 spin_lock_bh(&xs->pool->rx_lock); 444 err = __xsk_rcv(xs, xdp, len); 445 xsk_flush(xs); 446 spin_unlock_bh(&xs->pool->rx_lock); 447 } 448 449 return err; 450 } 451 452 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 453 { 454 u32 len = xdp_get_buff_len(xdp); 455 int err; 456 457 err = xsk_rcv_check(xs, xdp, len); 458 if (err) 459 return err; 460 461 if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { 462 len = xdp->data_end - xdp->data; 463 return xsk_rcv_zc(xs, xdp, len); 464 } 465 466 err = __xsk_rcv(xs, xdp, len); 467 if (!err) 468 xdp_return_buff(xdp); 469 return err; 470 } 471 472 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) 473 { 474 int err; 475 476 err = xsk_rcv(xs, xdp); 477 if (err) 478 return err; 479 480 if (!xs->flush_node.prev) { 481 struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); 482 483 list_add(&xs->flush_node, flush_list); 484 } 485 486 return 0; 487 } 488 489 void __xsk_map_flush(struct list_head *flush_list) 490 { 491 struct xdp_sock *xs, *tmp; 492 493 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 494 xsk_flush(xs); 495 __list_del_clearprev(&xs->flush_node); 496 } 497 } 498 499 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) 500 { 501 xskq_prod_submit_n(pool->cq, nb_entries); 502 } 503 EXPORT_SYMBOL(xsk_tx_completed); 504 505 void xsk_tx_release(struct xsk_buff_pool *pool) 506 { 507 struct xdp_sock *xs; 508 509 rcu_read_lock(); 510 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 511 __xsk_tx_release(xs); 512 rcu_read_unlock(); 513 } 514 EXPORT_SYMBOL(xsk_tx_release); 515 516 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) 517 { 518 bool budget_exhausted = false; 519 struct xdp_sock *xs; 520 521 rcu_read_lock(); 522 again: 523 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 524 if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { 525 budget_exhausted = true; 526 continue; 527 } 528 529 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { 530 if (xskq_has_descs(xs->tx)) 531 xskq_cons_release(xs->tx); 532 continue; 533 } 534 535 xs->tx_budget_spent++; 536 537 /* This is the backpressure mechanism for the Tx path. 538 * Reserve space in the completion queue and only proceed 539 * if there is space in it. This avoids having to implement 540 * any buffering in the Tx path. 541 */ 542 if (xskq_prod_reserve_addr(pool->cq, desc->addr)) 543 goto out; 544 545 xskq_cons_release(xs->tx); 546 rcu_read_unlock(); 547 return true; 548 } 549 550 if (budget_exhausted) { 551 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) 552 xs->tx_budget_spent = 0; 553 554 budget_exhausted = false; 555 goto again; 556 } 557 558 out: 559 rcu_read_unlock(); 560 return false; 561 } 562 EXPORT_SYMBOL(xsk_tx_peek_desc); 563 564 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) 565 { 566 struct xdp_desc *descs = pool->tx_descs; 567 u32 nb_pkts = 0; 568 569 while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) 570 nb_pkts++; 571 572 xsk_tx_release(pool); 573 return nb_pkts; 574 } 575 576 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) 577 { 578 struct xdp_sock *xs; 579 580 rcu_read_lock(); 581 if (!list_is_singular(&pool->xsk_tx_list)) { 582 /* Fallback to the non-batched version */ 583 rcu_read_unlock(); 584 return xsk_tx_peek_release_fallback(pool, nb_pkts); 585 } 586 587 xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); 588 if (!xs) { 589 nb_pkts = 0; 590 goto out; 591 } 592 593 nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); 594 595 /* This is the backpressure mechanism for the Tx path. Try to 596 * reserve space in the completion queue for all packets, but 597 * if there are fewer slots available, just process that many 598 * packets. This avoids having to implement any buffering in 599 * the Tx path. 600 */ 601 nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); 602 if (!nb_pkts) 603 goto out; 604 605 nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); 606 if (!nb_pkts) { 607 xs->tx->queue_empty_descs++; 608 goto out; 609 } 610 611 __xskq_cons_release(xs->tx); 612 xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); 613 xs->sk.sk_write_space(&xs->sk); 614 615 out: 616 rcu_read_unlock(); 617 return nb_pkts; 618 } 619 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); 620 621 static int xsk_wakeup(struct xdp_sock *xs, u8 flags) 622 { 623 struct net_device *dev = xs->dev; 624 625 return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); 626 } 627 628 static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) 629 { 630 int ret; 631 632 spin_lock(&pool->cq->cq_cached_prod_lock); 633 ret = xskq_prod_reserve(pool->cq); 634 spin_unlock(&pool->cq->cq_cached_prod_lock); 635 636 return ret; 637 } 638 639 static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) 640 { 641 return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; 642 } 643 644 static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) 645 { 646 return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); 647 } 648 649 static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) 650 { 651 struct xsk_addrs *xsk_addr; 652 653 xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); 654 if (unlikely(!xsk_addr)) 655 return NULL; 656 657 xsk_addr->addrs[0] = addr; 658 skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 659 return xsk_addr; 660 } 661 662 static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) 663 { 664 struct xsk_addrs *xsk_addr; 665 666 if (!xsk_skb_destructor_is_addr(skb)) 667 return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 668 669 xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); 670 if (likely(xsk_addr)) 671 xsk_addr->num_descs = 1; 672 return xsk_addr; 673 } 674 675 static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) 676 { 677 if (IS_ENABLED(CONFIG_64BIT)) { 678 skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); 679 return 0; 680 } 681 682 if (unlikely(!__xsk_addrs_alloc(skb, addr))) 683 return -ENOMEM; 684 return 0; 685 } 686 687 static void xsk_inc_num_desc(struct sk_buff *skb) 688 { 689 struct xsk_addrs *xsk_addr; 690 691 if (!xsk_skb_destructor_is_addr(skb)) { 692 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 693 xsk_addr->num_descs++; 694 } 695 } 696 697 static u32 xsk_get_num_desc(struct sk_buff *skb) 698 { 699 struct xsk_addrs *xsk_addr; 700 701 if (xsk_skb_destructor_is_addr(skb)) 702 return 1; 703 704 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 705 706 return xsk_addr->num_descs; 707 } 708 709 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, 710 struct sk_buff *skb) 711 { 712 u32 num_descs = xsk_get_num_desc(skb); 713 struct xsk_addrs *xsk_addr; 714 u32 descs_processed = 0; 715 unsigned long flags; 716 u32 idx, i; 717 718 spin_lock_irqsave(&pool->cq_prod_lock, flags); 719 idx = xskq_get_prod(pool->cq); 720 721 if (unlikely(!xsk_skb_destructor_is_addr(skb))) { 722 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 723 724 for (i = 0; i < num_descs; i++) { 725 xskq_prod_write_addr(pool->cq, idx + descs_processed, 726 xsk_addr->addrs[i]); 727 descs_processed++; 728 } 729 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 730 } else { 731 xskq_prod_write_addr(pool->cq, idx, 732 xsk_skb_destructor_get_addr(skb)); 733 descs_processed++; 734 } 735 xskq_prod_submit_n(pool->cq, descs_processed); 736 spin_unlock_irqrestore(&pool->cq_prod_lock, flags); 737 } 738 739 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) 740 { 741 spin_lock(&pool->cq->cq_cached_prod_lock); 742 xskq_prod_cancel_n(pool->cq, n); 743 spin_unlock(&pool->cq->cq_cached_prod_lock); 744 } 745 746 INDIRECT_CALLABLE_SCOPE 747 void xsk_destruct_skb(struct sk_buff *skb) 748 { 749 struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; 750 751 if (compl->tx_timestamp) { 752 /* sw completion timestamp, not a real one */ 753 *compl->tx_timestamp = ktime_get_tai_fast_ns(); 754 } 755 756 xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); 757 sock_wfree(skb); 758 } 759 760 static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, 761 u64 addr) 762 { 763 int err; 764 765 err = xsk_skb_destructor_set_addr(skb, addr); 766 if (unlikely(err)) 767 return err; 768 769 skb->dev = xs->dev; 770 skb->priority = READ_ONCE(xs->sk.sk_priority); 771 skb->mark = READ_ONCE(xs->sk.sk_mark); 772 skb->destructor = xsk_destruct_skb; 773 return 0; 774 } 775 776 static void xsk_consume_skb(struct sk_buff *skb) 777 { 778 struct xdp_sock *xs = xdp_sk(skb->sk); 779 u32 num_descs = xsk_get_num_desc(skb); 780 struct xsk_addrs *xsk_addr; 781 782 if (unlikely(!xsk_skb_destructor_is_addr(skb))) { 783 xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 784 kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 785 } 786 787 skb->destructor = sock_wfree; 788 xsk_cq_cancel_locked(xs->pool, num_descs); 789 /* Free skb without triggering the perf drop trace */ 790 consume_skb(skb); 791 xs->skb = NULL; 792 } 793 794 static void xsk_drop_skb(struct sk_buff *skb) 795 { 796 xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); 797 xsk_consume_skb(skb); 798 } 799 800 static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, 801 struct xdp_desc *desc, struct xsk_buff_pool *pool, 802 u32 hr) 803 { 804 struct xsk_tx_metadata *meta = NULL; 805 806 if (unlikely(pool->tx_metadata_len == 0)) 807 return -EINVAL; 808 809 meta = buffer - pool->tx_metadata_len; 810 if (unlikely(!xsk_buff_valid_tx_metadata(meta))) 811 return -EINVAL; 812 813 if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { 814 if (unlikely(meta->request.csum_start + 815 meta->request.csum_offset + 816 sizeof(__sum16) > desc->len)) 817 return -EINVAL; 818 819 skb->csum_start = hr + meta->request.csum_start; 820 skb->csum_offset = meta->request.csum_offset; 821 skb->ip_summed = CHECKSUM_PARTIAL; 822 823 if (unlikely(pool->tx_sw_csum)) { 824 int err; 825 826 err = skb_checksum_help(skb); 827 if (err) 828 return err; 829 } 830 } 831 832 if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) 833 skb->skb_mstamp_ns = meta->request.launch_time; 834 xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); 835 836 return 0; 837 } 838 839 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, 840 struct xdp_desc *desc) 841 { 842 struct xsk_buff_pool *pool = xs->pool; 843 u32 hr, len, ts, offset, copy, copied; 844 struct sk_buff *skb = xs->skb; 845 struct page *page; 846 void *buffer; 847 int err, i; 848 u64 addr; 849 850 addr = desc->addr; 851 buffer = xsk_buff_raw_get_data(pool, addr); 852 853 if (!skb) { 854 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); 855 856 skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); 857 if (unlikely(!skb)) 858 return ERR_PTR(err); 859 860 skb_reserve(skb, hr); 861 if (desc->options & XDP_TX_METADATA) { 862 err = xsk_skb_metadata(skb, buffer, desc, pool, hr); 863 if (unlikely(err)) { 864 kfree_skb(skb); 865 return ERR_PTR(err); 866 } 867 } 868 } else { 869 struct xsk_addrs *xsk_addr; 870 871 xsk_addr = xsk_addrs_alloc(skb); 872 if (!xsk_addr) 873 return ERR_PTR(-ENOMEM); 874 875 /* in case of -EOVERFLOW that could happen below, 876 * xsk_consume_skb() will release this node as whole skb 877 * would be dropped, which implies freeing all list elements 878 */ 879 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 880 } 881 882 len = desc->len; 883 ts = pool->unaligned ? len : pool->chunk_size; 884 885 offset = offset_in_page(buffer); 886 addr = buffer - pool->addrs; 887 888 for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { 889 if (unlikely(i >= MAX_SKB_FRAGS)) { 890 if (!xs->skb) 891 kfree_skb(skb); 892 return ERR_PTR(-EOVERFLOW); 893 } 894 895 page = pool->umem->pgs[addr >> PAGE_SHIFT]; 896 get_page(page); 897 898 copy = min_t(u32, PAGE_SIZE - offset, len - copied); 899 skb_fill_page_desc(skb, i, page, offset, copy); 900 901 copied += copy; 902 addr += copy; 903 offset = 0; 904 } 905 906 skb->len += len; 907 skb->data_len += len; 908 skb->truesize += ts; 909 910 refcount_add(ts, &xs->sk.sk_wmem_alloc); 911 912 return skb; 913 } 914 915 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, 916 struct xdp_desc *desc) 917 { 918 struct net_device *dev = xs->dev; 919 struct sk_buff *skb = xs->skb; 920 int err; 921 922 if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { 923 skb = xsk_build_skb_zerocopy(xs, desc); 924 if (IS_ERR(skb)) { 925 err = PTR_ERR(skb); 926 skb = NULL; 927 goto free_err; 928 } 929 } else { 930 u32 hr, tr, len; 931 void *buffer; 932 933 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); 934 len = desc->len; 935 936 if (!skb) { 937 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); 938 tr = dev->needed_tailroom; 939 skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); 940 if (unlikely(!skb)) 941 goto free_err; 942 943 skb_reserve(skb, hr); 944 skb_put(skb, len); 945 946 err = skb_store_bits(skb, 0, buffer, len); 947 if (unlikely(err)) 948 goto free_err; 949 950 if (desc->options & XDP_TX_METADATA) { 951 err = xsk_skb_metadata(skb, buffer, desc, 952 xs->pool, hr); 953 if (unlikely(err)) 954 goto free_err; 955 } 956 } else { 957 int nr_frags = skb_shinfo(skb)->nr_frags; 958 struct xsk_addrs *xsk_addr; 959 struct page *page; 960 u8 *vaddr; 961 962 xsk_addr = xsk_addrs_alloc(skb); 963 if (!xsk_addr) { 964 err = -ENOMEM; 965 goto free_err; 966 } 967 968 if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { 969 err = -EOVERFLOW; 970 goto free_err; 971 } 972 973 page = alloc_page(xs->sk.sk_allocation); 974 if (unlikely(!page)) { 975 err = -EAGAIN; 976 goto free_err; 977 } 978 979 vaddr = kmap_local_page(page); 980 memcpy(vaddr, buffer, len); 981 kunmap_local(vaddr); 982 983 skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); 984 refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); 985 986 xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 987 } 988 } 989 990 if (!xs->skb) { 991 err = xsk_skb_init_misc(skb, xs, desc->addr); 992 if (unlikely(err)) 993 goto free_err; 994 } 995 xsk_inc_num_desc(skb); 996 997 return skb; 998 999 free_err: 1000 if (skb && !xs->skb) 1001 kfree_skb(skb); 1002 1003 if (err == -EOVERFLOW) { 1004 if (xs->skb) { 1005 /* Drop the packet */ 1006 xsk_inc_num_desc(xs->skb); 1007 xsk_drop_skb(xs->skb); 1008 } else { 1009 xsk_cq_cancel_locked(xs->pool, 1); 1010 xs->tx->invalid_descs++; 1011 } 1012 xskq_cons_release(xs->tx); 1013 } else { 1014 /* Let application retry */ 1015 xsk_cq_cancel_locked(xs->pool, 1); 1016 } 1017 1018 return ERR_PTR(err); 1019 } 1020 1021 static int __xsk_generic_xmit(struct sock *sk) 1022 { 1023 struct xdp_sock *xs = xdp_sk(sk); 1024 bool sent_frame = false; 1025 struct xdp_desc desc; 1026 struct sk_buff *skb; 1027 u32 max_batch; 1028 int err = 0; 1029 1030 mutex_lock(&xs->mutex); 1031 1032 /* Since we dropped the RCU read lock, the socket state might have changed. */ 1033 if (unlikely(!xsk_is_bound(xs))) { 1034 err = -ENXIO; 1035 goto out; 1036 } 1037 1038 if (xs->queue_id >= xs->dev->real_num_tx_queues) 1039 goto out; 1040 1041 max_batch = READ_ONCE(xs->max_tx_budget); 1042 while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { 1043 if (max_batch-- == 0) { 1044 err = -EAGAIN; 1045 goto out; 1046 } 1047 1048 /* This is the backpressure mechanism for the Tx path. 1049 * Reserve space in the completion queue and only proceed 1050 * if there is space in it. This avoids having to implement 1051 * any buffering in the Tx path. 1052 */ 1053 err = xsk_cq_reserve_locked(xs->pool); 1054 if (err) { 1055 err = -EAGAIN; 1056 goto out; 1057 } 1058 1059 skb = xsk_build_skb(xs, &desc); 1060 if (IS_ERR(skb)) { 1061 err = PTR_ERR(skb); 1062 if (err != -EOVERFLOW) 1063 goto out; 1064 err = 0; 1065 continue; 1066 } 1067 1068 xskq_cons_release(xs->tx); 1069 1070 if (xp_mb_desc(&desc)) { 1071 xs->skb = skb; 1072 continue; 1073 } 1074 1075 err = __dev_direct_xmit(skb, xs->queue_id); 1076 if (err == NETDEV_TX_BUSY) { 1077 /* Tell user-space to retry the send */ 1078 xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); 1079 xsk_consume_skb(skb); 1080 err = -EAGAIN; 1081 goto out; 1082 } 1083 1084 /* Ignore NET_XMIT_CN as packet might have been sent */ 1085 if (err == NET_XMIT_DROP) { 1086 /* SKB completed but not sent */ 1087 err = -EBUSY; 1088 xs->skb = NULL; 1089 goto out; 1090 } 1091 1092 sent_frame = true; 1093 xs->skb = NULL; 1094 } 1095 1096 if (xskq_has_descs(xs->tx)) { 1097 if (xs->skb) 1098 xsk_drop_skb(xs->skb); 1099 xskq_cons_release(xs->tx); 1100 } 1101 1102 out: 1103 if (sent_frame) 1104 __xsk_tx_release(xs); 1105 1106 mutex_unlock(&xs->mutex); 1107 return err; 1108 } 1109 1110 static int xsk_generic_xmit(struct sock *sk) 1111 { 1112 int ret; 1113 1114 /* Drop the RCU lock since the SKB path might sleep. */ 1115 rcu_read_unlock(); 1116 ret = __xsk_generic_xmit(sk); 1117 /* Reaquire RCU lock before going into common code. */ 1118 rcu_read_lock(); 1119 1120 return ret; 1121 } 1122 1123 static bool xsk_no_wakeup(struct sock *sk) 1124 { 1125 #ifdef CONFIG_NET_RX_BUSY_POLL 1126 /* Prefer busy-polling, skip the wakeup. */ 1127 return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && 1128 napi_id_valid(READ_ONCE(sk->sk_napi_id)); 1129 #else 1130 return false; 1131 #endif 1132 } 1133 1134 static int xsk_check_common(struct xdp_sock *xs) 1135 { 1136 if (unlikely(!xsk_is_bound(xs))) 1137 return -ENXIO; 1138 if (unlikely(!(xs->dev->flags & IFF_UP))) 1139 return -ENETDOWN; 1140 1141 return 0; 1142 } 1143 1144 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1145 { 1146 bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 1147 struct sock *sk = sock->sk; 1148 struct xdp_sock *xs = xdp_sk(sk); 1149 struct xsk_buff_pool *pool; 1150 int err; 1151 1152 err = xsk_check_common(xs); 1153 if (err) 1154 return err; 1155 if (unlikely(need_wait)) 1156 return -EOPNOTSUPP; 1157 if (unlikely(!xs->tx)) 1158 return -ENOBUFS; 1159 1160 if (sk_can_busy_loop(sk)) 1161 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1162 1163 if (xs->zc && xsk_no_wakeup(sk)) 1164 return 0; 1165 1166 pool = xs->pool; 1167 if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { 1168 if (xs->zc) 1169 return xsk_wakeup(xs, XDP_WAKEUP_TX); 1170 return xsk_generic_xmit(sk); 1171 } 1172 return 0; 1173 } 1174 1175 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 1176 { 1177 int ret; 1178 1179 rcu_read_lock(); 1180 ret = __xsk_sendmsg(sock, m, total_len); 1181 rcu_read_unlock(); 1182 1183 return ret; 1184 } 1185 1186 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1187 { 1188 bool need_wait = !(flags & MSG_DONTWAIT); 1189 struct sock *sk = sock->sk; 1190 struct xdp_sock *xs = xdp_sk(sk); 1191 int err; 1192 1193 err = xsk_check_common(xs); 1194 if (err) 1195 return err; 1196 if (unlikely(!xs->rx)) 1197 return -ENOBUFS; 1198 if (unlikely(need_wait)) 1199 return -EOPNOTSUPP; 1200 1201 if (sk_can_busy_loop(sk)) 1202 sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 1203 1204 if (xsk_no_wakeup(sk)) 1205 return 0; 1206 1207 if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) 1208 return xsk_wakeup(xs, XDP_WAKEUP_RX); 1209 return 0; 1210 } 1211 1212 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 1213 { 1214 int ret; 1215 1216 rcu_read_lock(); 1217 ret = __xsk_recvmsg(sock, m, len, flags); 1218 rcu_read_unlock(); 1219 1220 return ret; 1221 } 1222 1223 static __poll_t xsk_poll(struct file *file, struct socket *sock, 1224 struct poll_table_struct *wait) 1225 { 1226 __poll_t mask = 0; 1227 struct sock *sk = sock->sk; 1228 struct xdp_sock *xs = xdp_sk(sk); 1229 struct xsk_buff_pool *pool; 1230 1231 sock_poll_wait(file, sock, wait); 1232 1233 rcu_read_lock(); 1234 if (xsk_check_common(xs)) 1235 goto out; 1236 1237 pool = xs->pool; 1238 1239 if (pool->cached_need_wakeup) { 1240 if (xs->zc) 1241 xsk_wakeup(xs, pool->cached_need_wakeup); 1242 else if (xs->tx) 1243 /* Poll needs to drive Tx also in copy mode */ 1244 xsk_generic_xmit(sk); 1245 } 1246 1247 if (xs->rx && !xskq_prod_is_empty(xs->rx)) 1248 mask |= EPOLLIN | EPOLLRDNORM; 1249 if (xs->tx && xsk_tx_writeable(xs)) 1250 mask |= EPOLLOUT | EPOLLWRNORM; 1251 out: 1252 rcu_read_unlock(); 1253 return mask; 1254 } 1255 1256 static int xsk_init_queue(u32 entries, struct xsk_queue **queue, 1257 bool umem_queue) 1258 { 1259 struct xsk_queue *q; 1260 1261 if (entries == 0 || *queue || !is_power_of_2(entries)) 1262 return -EINVAL; 1263 1264 q = xskq_create(entries, umem_queue); 1265 if (!q) 1266 return -ENOMEM; 1267 1268 /* Make sure queue is ready before it can be seen by others */ 1269 smp_wmb(); 1270 WRITE_ONCE(*queue, q); 1271 return 0; 1272 } 1273 1274 static void xsk_unbind_dev(struct xdp_sock *xs) 1275 { 1276 struct net_device *dev = xs->dev; 1277 1278 if (xs->state != XSK_BOUND) 1279 return; 1280 WRITE_ONCE(xs->state, XSK_UNBOUND); 1281 1282 /* Wait for driver to stop using the xdp socket. */ 1283 xp_del_xsk(xs->pool, xs); 1284 synchronize_net(); 1285 dev_put(dev); 1286 } 1287 1288 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, 1289 struct xdp_sock __rcu ***map_entry) 1290 { 1291 struct xsk_map *map = NULL; 1292 struct xsk_map_node *node; 1293 1294 *map_entry = NULL; 1295 1296 spin_lock_bh(&xs->map_list_lock); 1297 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 1298 node); 1299 if (node) { 1300 bpf_map_inc(&node->map->map); 1301 map = node->map; 1302 *map_entry = node->map_entry; 1303 } 1304 spin_unlock_bh(&xs->map_list_lock); 1305 return map; 1306 } 1307 1308 static void xsk_delete_from_maps(struct xdp_sock *xs) 1309 { 1310 /* This function removes the current XDP socket from all the 1311 * maps it resides in. We need to take extra care here, due to 1312 * the two locks involved. Each map has a lock synchronizing 1313 * updates to the entries, and each socket has a lock that 1314 * synchronizes access to the list of maps (map_list). For 1315 * deadlock avoidance the locks need to be taken in the order 1316 * "map lock"->"socket map list lock". We start off by 1317 * accessing the socket map list, and take a reference to the 1318 * map to guarantee existence between the 1319 * xsk_get_map_list_entry() and xsk_map_try_sock_delete() 1320 * calls. Then we ask the map to remove the socket, which 1321 * tries to remove the socket from the map. Note that there 1322 * might be updates to the map between 1323 * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 1324 */ 1325 struct xdp_sock __rcu **map_entry = NULL; 1326 struct xsk_map *map; 1327 1328 while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 1329 xsk_map_try_sock_delete(map, xs, map_entry); 1330 bpf_map_put(&map->map); 1331 } 1332 } 1333 1334 static int xsk_release(struct socket *sock) 1335 { 1336 struct sock *sk = sock->sk; 1337 struct xdp_sock *xs = xdp_sk(sk); 1338 struct net *net; 1339 1340 if (!sk) 1341 return 0; 1342 1343 net = sock_net(sk); 1344 1345 if (xs->skb) 1346 xsk_drop_skb(xs->skb); 1347 1348 mutex_lock(&net->xdp.lock); 1349 sk_del_node_init_rcu(sk); 1350 mutex_unlock(&net->xdp.lock); 1351 1352 sock_prot_inuse_add(net, sk->sk_prot, -1); 1353 1354 xsk_delete_from_maps(xs); 1355 mutex_lock(&xs->mutex); 1356 xsk_unbind_dev(xs); 1357 mutex_unlock(&xs->mutex); 1358 1359 xskq_destroy(xs->rx); 1360 xskq_destroy(xs->tx); 1361 xskq_destroy(xs->fq_tmp); 1362 xskq_destroy(xs->cq_tmp); 1363 1364 sock_orphan(sk); 1365 sock->sk = NULL; 1366 1367 sock_put(sk); 1368 1369 return 0; 1370 } 1371 1372 static struct socket *xsk_lookup_xsk_from_fd(int fd) 1373 { 1374 struct socket *sock; 1375 int err; 1376 1377 sock = sockfd_lookup(fd, &err); 1378 if (!sock) 1379 return ERR_PTR(-ENOTSOCK); 1380 1381 if (sock->sk->sk_family != PF_XDP) { 1382 sockfd_put(sock); 1383 return ERR_PTR(-ENOPROTOOPT); 1384 } 1385 1386 return sock; 1387 } 1388 1389 static bool xsk_validate_queues(struct xdp_sock *xs) 1390 { 1391 return xs->fq_tmp && xs->cq_tmp; 1392 } 1393 1394 static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) 1395 { 1396 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 1397 struct sock *sk = sock->sk; 1398 struct xdp_sock *xs = xdp_sk(sk); 1399 struct net_device *dev; 1400 int bound_dev_if; 1401 u32 flags, qid; 1402 int err = 0; 1403 1404 if (addr_len < sizeof(struct sockaddr_xdp)) 1405 return -EINVAL; 1406 if (sxdp->sxdp_family != AF_XDP) 1407 return -EINVAL; 1408 1409 flags = sxdp->sxdp_flags; 1410 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | 1411 XDP_USE_NEED_WAKEUP | XDP_USE_SG)) 1412 return -EINVAL; 1413 1414 bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 1415 if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) 1416 return -EINVAL; 1417 1418 rtnl_lock(); 1419 mutex_lock(&xs->mutex); 1420 if (xs->state != XSK_READY) { 1421 err = -EBUSY; 1422 goto out_release; 1423 } 1424 1425 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 1426 if (!dev) { 1427 err = -ENODEV; 1428 goto out_release; 1429 } 1430 1431 netdev_lock_ops(dev); 1432 1433 if (!xs->rx && !xs->tx) { 1434 err = -EINVAL; 1435 goto out_unlock; 1436 } 1437 1438 qid = sxdp->sxdp_queue_id; 1439 1440 if (flags & XDP_SHARED_UMEM) { 1441 struct xdp_sock *umem_xs; 1442 struct socket *sock; 1443 1444 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || 1445 (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { 1446 /* Cannot specify flags for shared sockets. */ 1447 err = -EINVAL; 1448 goto out_unlock; 1449 } 1450 1451 if (xs->umem) { 1452 /* We have already our own. */ 1453 err = -EINVAL; 1454 goto out_unlock; 1455 } 1456 1457 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); 1458 if (IS_ERR(sock)) { 1459 err = PTR_ERR(sock); 1460 goto out_unlock; 1461 } 1462 1463 umem_xs = xdp_sk(sock->sk); 1464 if (!xsk_is_bound(umem_xs)) { 1465 err = -EBADF; 1466 sockfd_put(sock); 1467 goto out_unlock; 1468 } 1469 1470 if (umem_xs->queue_id != qid || umem_xs->dev != dev) { 1471 /* One fill and completion ring required for each queue id. */ 1472 if (!xsk_validate_queues(xs)) { 1473 err = -EINVAL; 1474 sockfd_put(sock); 1475 goto out_unlock; 1476 } 1477 1478 /* Share the umem with another socket on another qid 1479 * and/or device. 1480 */ 1481 xs->pool = xp_create_and_assign_umem(xs, 1482 umem_xs->umem); 1483 if (!xs->pool) { 1484 err = -ENOMEM; 1485 sockfd_put(sock); 1486 goto out_unlock; 1487 } 1488 1489 err = xp_assign_dev_shared(xs->pool, umem_xs, dev, 1490 qid); 1491 if (err) { 1492 xp_destroy(xs->pool); 1493 xs->pool = NULL; 1494 sockfd_put(sock); 1495 goto out_unlock; 1496 } 1497 } else { 1498 /* Share the buffer pool with the other socket. */ 1499 if (xs->fq_tmp || xs->cq_tmp) { 1500 /* Do not allow setting your own fq or cq. */ 1501 err = -EINVAL; 1502 sockfd_put(sock); 1503 goto out_unlock; 1504 } 1505 1506 xp_get_pool(umem_xs->pool); 1507 xs->pool = umem_xs->pool; 1508 1509 /* If underlying shared umem was created without Tx 1510 * ring, allocate Tx descs array that Tx batching API 1511 * utilizes 1512 */ 1513 if (xs->tx && !xs->pool->tx_descs) { 1514 err = xp_alloc_tx_descs(xs->pool, xs); 1515 if (err) { 1516 xp_put_pool(xs->pool); 1517 xs->pool = NULL; 1518 sockfd_put(sock); 1519 goto out_unlock; 1520 } 1521 } 1522 } 1523 1524 xdp_get_umem(umem_xs->umem); 1525 WRITE_ONCE(xs->umem, umem_xs->umem); 1526 sockfd_put(sock); 1527 } else if (!xs->umem || !xsk_validate_queues(xs)) { 1528 err = -EINVAL; 1529 goto out_unlock; 1530 } else { 1531 /* This xsk has its own umem. */ 1532 xs->pool = xp_create_and_assign_umem(xs, xs->umem); 1533 if (!xs->pool) { 1534 err = -ENOMEM; 1535 goto out_unlock; 1536 } 1537 1538 err = xp_assign_dev(xs->pool, dev, qid, flags); 1539 if (err) { 1540 xp_destroy(xs->pool); 1541 xs->pool = NULL; 1542 goto out_unlock; 1543 } 1544 } 1545 1546 /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ 1547 xs->fq_tmp = NULL; 1548 xs->cq_tmp = NULL; 1549 1550 xs->dev = dev; 1551 xs->zc = xs->umem->zc; 1552 xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); 1553 xs->queue_id = qid; 1554 xp_add_xsk(xs->pool, xs); 1555 1556 if (qid < dev->real_num_rx_queues) { 1557 struct netdev_rx_queue *rxq; 1558 1559 rxq = __netif_get_rx_queue(dev, qid); 1560 if (rxq->napi) 1561 __sk_mark_napi_id_once(sk, rxq->napi->napi_id); 1562 } 1563 1564 out_unlock: 1565 if (err) { 1566 dev_put(dev); 1567 } else { 1568 /* Matches smp_rmb() in bind() for shared umem 1569 * sockets, and xsk_is_bound(). 1570 */ 1571 smp_wmb(); 1572 WRITE_ONCE(xs->state, XSK_BOUND); 1573 } 1574 netdev_unlock_ops(dev); 1575 out_release: 1576 mutex_unlock(&xs->mutex); 1577 rtnl_unlock(); 1578 return err; 1579 } 1580 1581 struct xdp_umem_reg_v1 { 1582 __u64 addr; /* Start of packet data area */ 1583 __u64 len; /* Length of packet data area */ 1584 __u32 chunk_size; 1585 __u32 headroom; 1586 }; 1587 1588 static int xsk_setsockopt(struct socket *sock, int level, int optname, 1589 sockptr_t optval, unsigned int optlen) 1590 { 1591 struct sock *sk = sock->sk; 1592 struct xdp_sock *xs = xdp_sk(sk); 1593 int err; 1594 1595 if (level != SOL_XDP) 1596 return -ENOPROTOOPT; 1597 1598 switch (optname) { 1599 case XDP_RX_RING: 1600 case XDP_TX_RING: 1601 { 1602 struct xsk_queue **q; 1603 int entries; 1604 1605 if (optlen < sizeof(entries)) 1606 return -EINVAL; 1607 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1608 return -EFAULT; 1609 1610 mutex_lock(&xs->mutex); 1611 if (xs->state != XSK_READY) { 1612 mutex_unlock(&xs->mutex); 1613 return -EBUSY; 1614 } 1615 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; 1616 err = xsk_init_queue(entries, q, false); 1617 if (!err && optname == XDP_TX_RING) 1618 /* Tx needs to be explicitly woken up the first time */ 1619 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 1620 mutex_unlock(&xs->mutex); 1621 return err; 1622 } 1623 case XDP_UMEM_REG: 1624 { 1625 size_t mr_size = sizeof(struct xdp_umem_reg); 1626 struct xdp_umem_reg mr = {}; 1627 struct xdp_umem *umem; 1628 1629 if (optlen < sizeof(struct xdp_umem_reg_v1)) 1630 return -EINVAL; 1631 else if (optlen < sizeof(mr)) 1632 mr_size = sizeof(struct xdp_umem_reg_v1); 1633 1634 BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); 1635 1636 /* Make sure the last field of the struct doesn't have 1637 * uninitialized padding. All padding has to be explicit 1638 * and has to be set to zero by the userspace to make 1639 * struct xdp_umem_reg extensible in the future. 1640 */ 1641 BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + 1642 sizeof_field(struct xdp_umem_reg, tx_metadata_len) != 1643 sizeof(struct xdp_umem_reg)); 1644 1645 if (copy_from_sockptr(&mr, optval, mr_size)) 1646 return -EFAULT; 1647 1648 mutex_lock(&xs->mutex); 1649 if (xs->state != XSK_READY || xs->umem) { 1650 mutex_unlock(&xs->mutex); 1651 return -EBUSY; 1652 } 1653 1654 umem = xdp_umem_create(&mr); 1655 if (IS_ERR(umem)) { 1656 mutex_unlock(&xs->mutex); 1657 return PTR_ERR(umem); 1658 } 1659 1660 /* Make sure umem is ready before it can be seen by others */ 1661 smp_wmb(); 1662 WRITE_ONCE(xs->umem, umem); 1663 mutex_unlock(&xs->mutex); 1664 return 0; 1665 } 1666 case XDP_UMEM_FILL_RING: 1667 case XDP_UMEM_COMPLETION_RING: 1668 { 1669 struct xsk_queue **q; 1670 int entries; 1671 1672 if (optlen < sizeof(entries)) 1673 return -EINVAL; 1674 if (copy_from_sockptr(&entries, optval, sizeof(entries))) 1675 return -EFAULT; 1676 1677 mutex_lock(&xs->mutex); 1678 if (xs->state != XSK_READY) { 1679 mutex_unlock(&xs->mutex); 1680 return -EBUSY; 1681 } 1682 1683 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : 1684 &xs->cq_tmp; 1685 err = xsk_init_queue(entries, q, true); 1686 mutex_unlock(&xs->mutex); 1687 return err; 1688 } 1689 case XDP_MAX_TX_SKB_BUDGET: 1690 { 1691 unsigned int budget; 1692 1693 if (optlen != sizeof(budget)) 1694 return -EINVAL; 1695 if (copy_from_sockptr(&budget, optval, sizeof(budget))) 1696 return -EFAULT; 1697 if (!xs->tx || 1698 budget < TX_BATCH_SIZE || budget > xs->tx->nentries) 1699 return -EACCES; 1700 1701 WRITE_ONCE(xs->max_tx_budget, budget); 1702 return 0; 1703 } 1704 default: 1705 break; 1706 } 1707 1708 return -ENOPROTOOPT; 1709 } 1710 1711 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) 1712 { 1713 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 1714 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 1715 ring->desc = offsetof(struct xdp_rxtx_ring, desc); 1716 } 1717 1718 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) 1719 { 1720 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); 1721 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 1722 ring->desc = offsetof(struct xdp_umem_ring, desc); 1723 } 1724 1725 struct xdp_statistics_v1 { 1726 __u64 rx_dropped; 1727 __u64 rx_invalid_descs; 1728 __u64 tx_invalid_descs; 1729 }; 1730 1731 static int xsk_getsockopt(struct socket *sock, int level, int optname, 1732 char __user *optval, int __user *optlen) 1733 { 1734 struct sock *sk = sock->sk; 1735 struct xdp_sock *xs = xdp_sk(sk); 1736 int len; 1737 1738 if (level != SOL_XDP) 1739 return -ENOPROTOOPT; 1740 1741 if (get_user(len, optlen)) 1742 return -EFAULT; 1743 if (len < 0) 1744 return -EINVAL; 1745 1746 switch (optname) { 1747 case XDP_STATISTICS: 1748 { 1749 struct xdp_statistics stats = {}; 1750 bool extra_stats = true; 1751 size_t stats_size; 1752 1753 if (len < sizeof(struct xdp_statistics_v1)) { 1754 return -EINVAL; 1755 } else if (len < sizeof(stats)) { 1756 extra_stats = false; 1757 stats_size = sizeof(struct xdp_statistics_v1); 1758 } else { 1759 stats_size = sizeof(stats); 1760 } 1761 1762 mutex_lock(&xs->mutex); 1763 stats.rx_dropped = xs->rx_dropped; 1764 if (extra_stats) { 1765 stats.rx_ring_full = xs->rx_queue_full; 1766 stats.rx_fill_ring_empty_descs = 1767 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; 1768 stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); 1769 } else { 1770 stats.rx_dropped += xs->rx_queue_full; 1771 } 1772 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); 1773 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); 1774 mutex_unlock(&xs->mutex); 1775 1776 if (copy_to_user(optval, &stats, stats_size)) 1777 return -EFAULT; 1778 if (put_user(stats_size, optlen)) 1779 return -EFAULT; 1780 1781 return 0; 1782 } 1783 case XDP_MMAP_OFFSETS: 1784 { 1785 struct xdp_mmap_offsets off; 1786 struct xdp_mmap_offsets_v1 off_v1; 1787 bool flags_supported = true; 1788 void *to_copy; 1789 1790 if (len < sizeof(off_v1)) 1791 return -EINVAL; 1792 else if (len < sizeof(off)) 1793 flags_supported = false; 1794 1795 if (flags_supported) { 1796 /* xdp_ring_offset is identical to xdp_ring_offset_v1 1797 * except for the flags field added to the end. 1798 */ 1799 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1800 &off.rx); 1801 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 1802 &off.tx); 1803 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1804 &off.fr); 1805 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 1806 &off.cr); 1807 off.rx.flags = offsetof(struct xdp_rxtx_ring, 1808 ptrs.flags); 1809 off.tx.flags = offsetof(struct xdp_rxtx_ring, 1810 ptrs.flags); 1811 off.fr.flags = offsetof(struct xdp_umem_ring, 1812 ptrs.flags); 1813 off.cr.flags = offsetof(struct xdp_umem_ring, 1814 ptrs.flags); 1815 1816 len = sizeof(off); 1817 to_copy = &off; 1818 } else { 1819 xsk_enter_rxtx_offsets(&off_v1.rx); 1820 xsk_enter_rxtx_offsets(&off_v1.tx); 1821 xsk_enter_umem_offsets(&off_v1.fr); 1822 xsk_enter_umem_offsets(&off_v1.cr); 1823 1824 len = sizeof(off_v1); 1825 to_copy = &off_v1; 1826 } 1827 1828 if (copy_to_user(optval, to_copy, len)) 1829 return -EFAULT; 1830 if (put_user(len, optlen)) 1831 return -EFAULT; 1832 1833 return 0; 1834 } 1835 case XDP_OPTIONS: 1836 { 1837 struct xdp_options opts = {}; 1838 1839 if (len < sizeof(opts)) 1840 return -EINVAL; 1841 1842 mutex_lock(&xs->mutex); 1843 if (xs->zc) 1844 opts.flags |= XDP_OPTIONS_ZEROCOPY; 1845 mutex_unlock(&xs->mutex); 1846 1847 len = sizeof(opts); 1848 if (copy_to_user(optval, &opts, len)) 1849 return -EFAULT; 1850 if (put_user(len, optlen)) 1851 return -EFAULT; 1852 1853 return 0; 1854 } 1855 default: 1856 break; 1857 } 1858 1859 return -EOPNOTSUPP; 1860 } 1861 1862 static int xsk_mmap(struct file *file, struct socket *sock, 1863 struct vm_area_struct *vma) 1864 { 1865 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 1866 unsigned long size = vma->vm_end - vma->vm_start; 1867 struct xdp_sock *xs = xdp_sk(sock->sk); 1868 int state = READ_ONCE(xs->state); 1869 struct xsk_queue *q = NULL; 1870 1871 if (state != XSK_READY && state != XSK_BOUND) 1872 return -EBUSY; 1873 1874 if (offset == XDP_PGOFF_RX_RING) { 1875 q = READ_ONCE(xs->rx); 1876 } else if (offset == XDP_PGOFF_TX_RING) { 1877 q = READ_ONCE(xs->tx); 1878 } else { 1879 /* Matches the smp_wmb() in XDP_UMEM_REG */ 1880 smp_rmb(); 1881 if (offset == XDP_UMEM_PGOFF_FILL_RING) 1882 q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : 1883 READ_ONCE(xs->pool->fq); 1884 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 1885 q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : 1886 READ_ONCE(xs->pool->cq); 1887 } 1888 1889 if (!q) 1890 return -EINVAL; 1891 1892 /* Matches the smp_wmb() in xsk_init_queue */ 1893 smp_rmb(); 1894 if (size > q->ring_vmalloc_size) 1895 return -EINVAL; 1896 1897 return remap_vmalloc_range(vma, q->ring, 0); 1898 } 1899 1900 static int xsk_notifier(struct notifier_block *this, 1901 unsigned long msg, void *ptr) 1902 { 1903 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1904 struct net *net = dev_net(dev); 1905 struct sock *sk; 1906 1907 switch (msg) { 1908 case NETDEV_UNREGISTER: 1909 mutex_lock(&net->xdp.lock); 1910 sk_for_each(sk, &net->xdp.list) { 1911 struct xdp_sock *xs = xdp_sk(sk); 1912 1913 mutex_lock(&xs->mutex); 1914 if (xs->dev == dev) { 1915 sk->sk_err = ENETDOWN; 1916 if (!sock_flag(sk, SOCK_DEAD)) 1917 sk_error_report(sk); 1918 1919 xsk_unbind_dev(xs); 1920 1921 /* Clear device references. */ 1922 xp_clear_dev(xs->pool); 1923 } 1924 mutex_unlock(&xs->mutex); 1925 } 1926 mutex_unlock(&net->xdp.lock); 1927 break; 1928 } 1929 return NOTIFY_DONE; 1930 } 1931 1932 static struct proto xsk_proto = { 1933 .name = "XDP", 1934 .owner = THIS_MODULE, 1935 .obj_size = sizeof(struct xdp_sock), 1936 }; 1937 1938 static const struct proto_ops xsk_proto_ops = { 1939 .family = PF_XDP, 1940 .owner = THIS_MODULE, 1941 .release = xsk_release, 1942 .bind = xsk_bind, 1943 .connect = sock_no_connect, 1944 .socketpair = sock_no_socketpair, 1945 .accept = sock_no_accept, 1946 .getname = sock_no_getname, 1947 .poll = xsk_poll, 1948 .ioctl = sock_no_ioctl, 1949 .listen = sock_no_listen, 1950 .shutdown = sock_no_shutdown, 1951 .setsockopt = xsk_setsockopt, 1952 .getsockopt = xsk_getsockopt, 1953 .sendmsg = xsk_sendmsg, 1954 .recvmsg = xsk_recvmsg, 1955 .mmap = xsk_mmap, 1956 }; 1957 1958 static void xsk_destruct(struct sock *sk) 1959 { 1960 struct xdp_sock *xs = xdp_sk(sk); 1961 1962 if (!sock_flag(sk, SOCK_DEAD)) 1963 return; 1964 1965 if (!xp_put_pool(xs->pool)) 1966 xdp_put_umem(xs->umem, !xs->pool); 1967 } 1968 1969 static int xsk_create(struct net *net, struct socket *sock, int protocol, 1970 int kern) 1971 { 1972 struct xdp_sock *xs; 1973 struct sock *sk; 1974 1975 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 1976 return -EPERM; 1977 if (sock->type != SOCK_RAW) 1978 return -ESOCKTNOSUPPORT; 1979 1980 if (protocol) 1981 return -EPROTONOSUPPORT; 1982 1983 sock->state = SS_UNCONNECTED; 1984 1985 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); 1986 if (!sk) 1987 return -ENOBUFS; 1988 1989 sock->ops = &xsk_proto_ops; 1990 1991 sock_init_data(sock, sk); 1992 1993 sk->sk_family = PF_XDP; 1994 1995 sk->sk_destruct = xsk_destruct; 1996 1997 sock_set_flag(sk, SOCK_RCU_FREE); 1998 1999 xs = xdp_sk(sk); 2000 xs->state = XSK_READY; 2001 xs->max_tx_budget = TX_BATCH_SIZE; 2002 mutex_init(&xs->mutex); 2003 2004 INIT_LIST_HEAD(&xs->map_list); 2005 spin_lock_init(&xs->map_list_lock); 2006 2007 mutex_lock(&net->xdp.lock); 2008 sk_add_node_rcu(sk, &net->xdp.list); 2009 mutex_unlock(&net->xdp.lock); 2010 2011 sock_prot_inuse_add(net, &xsk_proto, 1); 2012 2013 return 0; 2014 } 2015 2016 static const struct net_proto_family xsk_family_ops = { 2017 .family = PF_XDP, 2018 .create = xsk_create, 2019 .owner = THIS_MODULE, 2020 }; 2021 2022 static struct notifier_block xsk_netdev_notifier = { 2023 .notifier_call = xsk_notifier, 2024 }; 2025 2026 static int __net_init xsk_net_init(struct net *net) 2027 { 2028 mutex_init(&net->xdp.lock); 2029 INIT_HLIST_HEAD(&net->xdp.list); 2030 return 0; 2031 } 2032 2033 static void __net_exit xsk_net_exit(struct net *net) 2034 { 2035 WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); 2036 } 2037 2038 static struct pernet_operations xsk_net_ops = { 2039 .init = xsk_net_init, 2040 .exit = xsk_net_exit, 2041 }; 2042 2043 static int __init xsk_init(void) 2044 { 2045 int err; 2046 2047 err = proto_register(&xsk_proto, 0 /* no slab */); 2048 if (err) 2049 goto out; 2050 2051 err = sock_register(&xsk_family_ops); 2052 if (err) 2053 goto out_proto; 2054 2055 err = register_pernet_subsys(&xsk_net_ops); 2056 if (err) 2057 goto out_sk; 2058 2059 err = register_netdevice_notifier(&xsk_netdev_notifier); 2060 if (err) 2061 goto out_pernet; 2062 2063 xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", 2064 sizeof(struct xsk_addrs), 2065 0, SLAB_HWCACHE_ALIGN, NULL); 2066 if (!xsk_tx_generic_cache) { 2067 err = -ENOMEM; 2068 goto out_unreg_notif; 2069 } 2070 2071 return 0; 2072 2073 out_unreg_notif: 2074 unregister_netdevice_notifier(&xsk_netdev_notifier); 2075 out_pernet: 2076 unregister_pernet_subsys(&xsk_net_ops); 2077 out_sk: 2078 sock_unregister(PF_XDP); 2079 out_proto: 2080 proto_unregister(&xsk_proto); 2081 out: 2082 return err; 2083 } 2084 2085 fs_initcall(xsk_init); 2086