// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/vmalloc.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32

struct xsk_addr_node {
	u64 addr;
	struct list_head addr_node;
};

struct xsk_addr_head {
	u32 num_descs;
	struct list_head addrs_list;
};

static struct kmem_cache *xsk_tx_generic_cache;

#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))

void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the
 * opposite. This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
			u32 flags)
{
	u64 addr;
	int err;

	addr = xp_get_handle(xskb, xskb->pool);
	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u32 frags = xdp_buff_has_frags(xdp);
	struct xdp_buff_xsk *pos, *tmp;
	struct list_head *xskb_list;
	u32 contd = 0;
	int err;

	if (frags)
		contd = XDP_PKT_CONTD;

	err = __xsk_rcv_zc(xs, xskb, len, contd);
	if (err)
		goto err;
	if (likely(!frags))
		return 0;

	xskb_list = &xskb->pool->xskb_list;
	list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
		if (list_is_singular(xskb_list))
			contd = 0;
		len = pos->xdp.data_end - pos->xdp.data;
		err = __xsk_rcv_zc(xs, pos, len, contd);
		if (err)
			goto err;
		list_del(&pos->list_node);
	}

	return 0;
err:
	xsk_buff_free(xdp);
	return err;
}

static void *xsk_copy_xdp_start(struct xdp_buff *from)
{
	if (unlikely(xdp_data_meta_unsupported(from)))
		return from->data;
	else
		return from->data_meta;
}

static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
			u32 *from_len, skb_frag_t **frag, u32 rem)
{
	u32 copied = 0;

	while (1) {
		u32 copy_len = min_t(u32, *from_len, to_len);

		memcpy(to, *from, copy_len);
		copied += copy_len;
		if (rem == copied)
			return copied;

		if (*from_len == copy_len) {
			*from = skb_frag_address(*frag);
			*from_len = skb_frag_size((*frag)++);
		} else {
			*from += copy_len;
			*from_len -= copy_len;
		}
		if (to_len == copy_len)
			return copied;

		to_len -= copy_len;
		to += copy_len;
	}
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
	u32 from_len, meta_len, rem, num_desc;
	struct xdp_buff_xsk *xskb;
	struct xdp_buff *xsk_xdp;
	skb_frag_t *frag;

	from_len = xdp->data_end - copy_from;
	meta_len = xdp->data - copy_from;
	rem = len + meta_len;

	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
		int err;

		xsk_xdp = xsk_buff_alloc(xs->pool);
		if (!xsk_xdp) {
			xs->rx_dropped++;
			return -ENOMEM;
		}
		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		err = __xsk_rcv_zc(xs, xskb, len, 0);
		if (err) {
			xsk_buff_free(xsk_xdp);
			return err;
		}

		return 0;
	}

	num_desc = (len - 1) / frame_size + 1;

	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
		xs->rx_dropped++;
		return -ENOMEM;
	}
	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
		xs->rx_queue_full++;
		return -ENOBUFS;
	}

	if (xdp_buff_has_frags(xdp)) {
		struct skb_shared_info *sinfo;

		sinfo = xdp_get_shared_info_from_buff(xdp);
		frag = &sinfo->frags[0];
	}

	do {
		u32 to_len = frame_size + meta_len;
		u32 copied;

		xsk_xdp = xsk_buff_alloc(xs->pool);
		copy_to = xsk_xdp->data - meta_len;

		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
		rem -= copied;

		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
		meta_len = 0;
	} while (rem);

	return 0;
}

static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static void __xsk_tx_release(struct xdp_sock *xs)
{
	__xskq_cons_release(xs->tx);
	if (xsk_tx_writeable(xs))
		xs->sk.sk_write_space(&xs->sk);
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	if (!xsk_is_bound(xs))
		return -ENXIO;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	return 0;
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len = xdp_get_buff_len(xdp);
	int err;

	err = xsk_rcv_check(xs, xdp, len);
	if (!err) {
		spin_lock_bh(&xs->pool->rx_lock);
		err = __xsk_rcv(xs, xdp, len);
		xsk_flush(xs);
		spin_unlock_bh(&xs->pool->rx_lock);
	}

	return err;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len = xdp_get_buff_len(xdp);
	int err;

	err = xsk_rcv_check(xs, xdp, len);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp, len);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev) {
		struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();

		list_add(&xs->flush_node, flush_list);
	}

	return 0;
}

void __xsk_map_flush(struct list_head *flush_list)
{
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
		__xsk_tx_release(xs);
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

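/* Peek at the next Tx descriptor from one of the sockets sharing this pool.
 * A per-socket budget (MAX_PER_SOCKET_BUDGET) keeps one busy socket from
 * starving the others; once every socket has spent its budget, the budgets
 * are reset and the scan is retried.
 */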
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	bool budget_exhausted = false;
	struct xdp_sock *xs;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
			budget_exhausted = true;
			continue;
		}

		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			if (xskq_has_descs(xs->tx))
				xskq_cons_release(xs->tx);
			continue;
		}

		xs->tx_budget_spent++;

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

	if (budget_exhausted) {
		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
			xs->tx_budget_spent = 0;

		budget_exhausted = false;
		goto again;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
{
	struct xdp_desc *descs = pool->tx_descs;
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fall back to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, nb_pkts);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
	if (!nb_pkts)
		goto out;

	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	__xskq_cons_release(xs->tx);
	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}

static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&pool->cq_lock, flags);
	ret = xskq_prod_reserve(pool->cq);
	spin_unlock_irqrestore(&pool->cq_lock, flags);

	return ret;
}

static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
				      struct sk_buff *skb)
{
	struct xsk_addr_node *pos, *tmp;
	u32 descs_processed = 0;
	unsigned long flags;
	u32 idx;

	spin_lock_irqsave(&pool->cq_lock, flags);
	idx = xskq_get_prod(pool->cq);

	xskq_prod_write_addr(pool->cq, idx,
			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
	descs_processed++;

	if (unlikely(XSKCB(skb)->num_descs > 1)) {
		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
			xskq_prod_write_addr(pool->cq, idx + descs_processed,
					     pos->addr);
			descs_processed++;
			list_del(&pos->addr_node);
			kmem_cache_free(xsk_tx_generic_cache, pos);
		}
	}
	xskq_prod_submit_n(pool->cq, descs_processed);
	spin_unlock_irqrestore(&pool->cq_lock, flags);
}

static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
{
	unsigned long flags;

	spin_lock_irqsave(&pool->cq_lock, flags);
	xskq_prod_cancel_n(pool->cq, n);
	spin_unlock_irqrestore(&pool->cq_lock, flags);
}

static void xsk_inc_num_desc(struct sk_buff *skb)
{
	XSKCB(skb)->num_descs++;
}

static u32 xsk_get_num_desc(struct sk_buff *skb)
{
	return XSKCB(skb)->num_descs;
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;

	if (compl->tx_timestamp) {
		/* sw completion timestamp, not a real one */
		*compl->tx_timestamp = ktime_get_tai_fast_ns();
	}

	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
	sock_wfree(skb);
}

static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
{
	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
	XSKCB(skb)->num_descs = 0;
	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
}

static void xsk_consume_skb(struct sk_buff *skb)
{
	struct xdp_sock *xs = xdp_sk(skb->sk);
	u32 num_descs = xsk_get_num_desc(skb);
	struct xsk_addr_node *pos, *tmp;

	if (unlikely(num_descs > 1)) {
		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
			list_del(&pos->addr_node);
			kmem_cache_free(xsk_tx_generic_cache, pos);
		}
	}

	skb->destructor = sock_wfree;
	xsk_cq_cancel_locked(xs->pool, num_descs);
	/* Free skb without triggering the perf drop trace */
	consume_skb(skb);
	xs->skb = NULL;
}

static void xsk_drop_skb(struct sk_buff *skb)
{
	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
	xsk_consume_skb(skb);
}

static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct xsk_addr_node *xsk_addr;
	struct sk_buff *skb = xs->skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	if (!skb) {
		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);

		xsk_set_destructor_arg(skb, desc->addr);
	} else {
		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
		if (!xsk_addr)
			return ERR_PTR(-ENOMEM);

		/* In case of -EOVERFLOW below, xsk_consume_skb() will release
		 * this node, as the whole skb would be dropped, which implies
		 * freeing all list elements.
		 */
		xsk_addr->addr = desc->addr;
		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
	}

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	addr = buffer - pool->addrs;

	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
		if (unlikely(i >= MAX_SKB_FRAGS))
			return ERR_PTR(-EOVERFLOW);

		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}

static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct xsk_tx_metadata *meta = NULL;
	struct net_device *dev = xs->dev;
	struct sk_buff *skb = xs->skb;
	bool first_frag = false;
	int err;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			goto free_err;
		}
	} else {
		u32 hr, tr, len;
		void *buffer;

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		len = desc->len;

		if (!skb) {
			first_frag = true;

			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
			tr = dev->needed_tailroom;
			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
			if (unlikely(!skb))
				goto free_err;

			skb_reserve(skb, hr);
			skb_put(skb, len);

			err = skb_store_bits(skb, 0, buffer, len);
			if (unlikely(err))
				goto free_err;

			xsk_set_destructor_arg(skb, desc->addr);
		} else {
			int nr_frags = skb_shinfo(skb)->nr_frags;
			struct xsk_addr_node *xsk_addr;
			struct page *page;
			u8 *vaddr;

			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
				err = -EOVERFLOW;
				goto free_err;
			}

			page = alloc_page(xs->sk.sk_allocation);
			if (unlikely(!page)) {
				err = -EAGAIN;
				goto free_err;
			}

			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
			if (!xsk_addr) {
				__free_page(page);
				err = -ENOMEM;
				goto free_err;
			}

			vaddr = kmap_local_page(page);
			memcpy(vaddr, buffer, len);
			kunmap_local(vaddr);

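			/* Attach the copied page as a new fragment and charge
			 * its truesize to the socket's write allocation.
			 */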
			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);

			xsk_addr->addr = desc->addr;
			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
		}

		if (first_frag && desc->options & XDP_TX_METADATA) {
			if (unlikely(xs->pool->tx_metadata_len == 0)) {
				err = -EINVAL;
				goto free_err;
			}

			meta = buffer - xs->pool->tx_metadata_len;
			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
				err = -EINVAL;
				goto free_err;
			}

			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
				if (unlikely(meta->request.csum_start +
					     meta->request.csum_offset +
					     sizeof(__sum16) > len)) {
					err = -EINVAL;
					goto free_err;
				}

				skb->csum_start = hr + meta->request.csum_start;
				skb->csum_offset = meta->request.csum_offset;
				skb->ip_summed = CHECKSUM_PARTIAL;

				if (unlikely(xs->pool->tx_sw_csum)) {
					err = skb_checksum_help(skb);
					if (err)
						goto free_err;
				}
			}

			if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
				skb->skb_mstamp_ns = meta->request.launch_time;
		}
	}

	skb->dev = dev;
	skb->priority = READ_ONCE(xs->sk.sk_priority);
	skb->mark = READ_ONCE(xs->sk.sk_mark);
	skb->destructor = xsk_destruct_skb;
	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
	xsk_inc_num_desc(skb);

	return skb;

free_err:
	if (first_frag && skb)
		kfree_skb(skb);

	if (err == -EOVERFLOW) {
		/* Drop the packet */
		xsk_inc_num_desc(xs->skb);
		xsk_drop_skb(xs->skb);
		xskq_cons_release(xs->tx);
	} else {
		/* Let application retry */
		xsk_cq_cancel_locked(xs->pool, 1);
	}

	return ERR_PTR(err);
}

static int __xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	u32 max_batch;
	int err = 0;

	mutex_lock(&xs->mutex);

	/* Since we dropped the RCU read lock, the socket state might have changed. */
	if (unlikely(!xsk_is_bound(xs))) {
		err = -ENXIO;
		goto out;
	}

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	max_batch = READ_ONCE(xs->max_tx_budget);
	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		err = xsk_cq_reserve_locked(xs->pool);
		if (err) {
			err = -EAGAIN;
			goto out;
		}

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			if (err != -EOVERFLOW)
				goto out;
			err = 0;
			continue;
		}

		xskq_cons_release(xs->tx);

		if (xp_mb_desc(&desc)) {
			xs->skb = skb;
			continue;
		}

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
			xsk_consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			xs->skb = NULL;
			goto out;
		}

		sent_frame = true;
		xs->skb = NULL;
	}

	if (xskq_has_descs(xs->tx)) {
		if (xs->skb)
			xsk_drop_skb(xs->skb);
		xskq_cons_release(xs->tx);
	}

out:
	if (sent_frame)
		__xsk_tx_release(xs);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_generic_xmit(struct sock *sk)
{
	int ret;

	/* Drop the RCU lock since the SKB path might sleep. */
	rcu_read_unlock();
	ret = __xsk_generic_xmit(sk);
	/* Reacquire RCU lock before going into common code. */
	rcu_read_lock();

	return ret;
}

static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
	       napi_id_valid(READ_ONCE(sk->sk_napi_id));
#else
	return false;
#endif
}

static int xsk_check_common(struct xdp_sock *xs)
{
	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return 0;
}

static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;
	int err;

	err = xsk_check_common(xs);
	if (err)
		return err;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xs->zc && xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
		if (xs->zc)
			return xsk_wakeup(xs, XDP_WAKEUP_TX);
		return xsk_generic_xmit(sk);
	}
	return 0;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	int ret;

	rcu_read_lock();
	ret = __xsk_sendmsg(sock, m, total_len);
	rcu_read_unlock();

	return ret;
}

static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	err = xsk_check_common(xs);
	if (err)
		return err;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup &
	    XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}

static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	int ret;

	rcu_read_lock();
	ret = __xsk_recvmsg(sock, m, len, flags);
	rcu_read_unlock();

	return ret;
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			 struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	rcu_read_lock();
	if (xsk_check_common(xs))
		goto out;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else if (xs->tx)
			/* Poll needs to drive Tx also in copy mode */
			xsk_generic_xmit(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;
out:
	rcu_read_unlock();
	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock __rcu ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock __rcu **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		bpf_map_put(&map->map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	if (xs->skb)
		xsk_drop_skb(xs->skb);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	sock_prot_inuse_add(net, sk->sk_prot, -1);

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	int bound_dev_if;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
		return -EINVAL;

	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	netdev_lock_ops(dev);

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
						   qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;

			/* If the underlying shared umem was created without a
			 * Tx ring, allocate the Tx descs array that the Tx
			 * batching API uses.
			 */
			if (xs->tx && !xs->pool->tx_descs) {
				err = xp_alloc_tx_descs(xs->pool, xs);
				if (err) {
					xp_put_pool(xs->pool);
					xs->pool = NULL;
					sockfd_put(sock);
					goto out_unlock;
				}
			}
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

	if (qid < dev->real_num_rx_queues) {
		struct netdev_rx_queue *rxq;

		rxq = __netif_get_rx_queue(dev, qid);
		if (rxq->napi)
			__sk_mark_napi_id_once(sk, rxq->napi->napi_id);
	}

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
	netdev_unlock_ops(dev);
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ?
			&xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg));

		/* Make sure the last field of the struct doesn't have
		 * uninitialized padding. All padding has to be explicit
		 * and has to be set to zero by the userspace to make
		 * struct xdp_umem_reg extensible in the future.
		 */
		BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) +
			     sizeof_field(struct xdp_umem_reg, tx_metadata_len) !=
			     sizeof(struct xdp_umem_reg));

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ?
			&xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_MAX_TX_SKB_BUDGET:
	{
		unsigned int budget;

		if (optlen != sizeof(budget))
			return -EINVAL;
		if (copy_from_sockptr(&budget, optval, sizeof(budget)))
			return -EFAULT;
		if (!xs->tx ||
		    budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
			return -EACCES;

		WRITE_ONCE(xs->max_tx_budget, budget);
		return 0;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	int state = READ_ONCE(xs->state);
	struct xsk_queue *q = NULL;

	if (state != XSK_READY && state != XSK_BOUND)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
						 READ_ONCE(xs->pool->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
						 READ_ONCE(xs->pool->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	if (size > q->ring_vmalloc_size)
		return -EINVAL;

	return remap_vmalloc_range(vma, q->ring, 0);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references.
				 */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	if (!xp_put_pool(xs->pool))
		xdp_put_umem(xs->umem, !xs->pool);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	xs->max_tx_budget = TX_BATCH_SIZE;
	mutex_init(&xs->mutex);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	sock_prot_inuse_add(net, &xsk_proto, 1);

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
						 sizeof(struct xsk_addr_node),
						 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!xsk_tx_generic_cache) {
		err = -ENOMEM;
		goto out_unreg_notif;
	}

	return 0;

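/* Error unwind: undo the registrations in the reverse order of setup. */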
out_unreg_notif:
	unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);