// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *            Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
                READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
        return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
        return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
        xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
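
/* Copy-mode receive: pop an address from the fill queue, copy the frame
 * (including any XDP metadata preceding xdp->data) into that umem chunk,
 * and publish a descriptor on the socket's RX ring. The frame is dropped
 * and counted in rx_dropped if no fill-queue address is available or the
 * frame does not fit in a chunk. __xsk_rcv_zc() is the zero-copy variant,
 * where the data already lives in the umem and only a descriptor for the
 * existing handle is produced.
 */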
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        void *to_buf, *from_buf;
        u32 metalen;
        u64 addr;
        int err;

        if (!xskq_peek_addr(xs->umem->fq, &addr) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        addr += xs->umem->headroom;

        if (unlikely(xdp_data_meta_unsupported(xdp))) {
                from_buf = xdp->data;
                metalen = 0;
        } else {
                from_buf = xdp->data_meta;
                metalen = xdp->data - xdp->data_meta;
        }

        to_buf = xdp_umem_get_data(xs->umem, addr);
        memcpy(to_buf, from_buf, len + metalen);
        addr += metalen;
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xdp_return_buff(xdp);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

        if (err)
                xs->rx_dropped++;

        return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        len = xdp->data_end - xdp->data;

        return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
                __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 metalen = xdp->data - xdp->data_meta;
        u32 len = xdp->data_end - xdp->data;
        void *buffer;
        u64 addr;
        int err;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        if (!xskq_peek_addr(xs->umem->fq, &addr) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        addr += xs->umem->headroom;

        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data_meta, len + metalen);
        addr += metalen;
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xsk_flush(xs);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
        xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->sk.sk_write_space(&xs->sk);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                if (!xskq_peek_desc(xs->tx, desc))
                        continue;

                if (xskq_produce_addr_lazy(umem->cq, desc->addr))
                        goto out;

                xskq_discard_desc(xs->tx);
                rcu_read_unlock();
                return true;
        }

out:
        rcu_read_unlock();
        return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
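
/* Transmit path. In zero-copy mode, xsk_zc_xmit() only kicks the driver,
 * which then pulls descriptors off the TX ring via xsk_umem_consume_tx()
 * and reports completions through xsk_umem_complete_tx(). In copy mode,
 * xsk_generic_xmit() builds one skb per TX descriptor and the completion
 * entry is produced on the completion ring from xsk_destruct_skb() when
 * the skb is freed.
 */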
static int xsk_zc_xmit(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev = xs->dev;

        return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
        u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);
        unsigned long flags;

        spin_lock_irqsave(&xs->tx_completion_lock, flags);
        WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
        spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

        sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                            size_t total_len)
{
        u32 max_batch = TX_BATCH_SIZE;
        struct xdp_sock *xs = xdp_sk(sk);
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        mutex_lock(&xs->mutex);

        while (xskq_peek_desc(xs->tx, &desc)) {
                char *buffer;
                u64 addr;
                u32 len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                if (xskq_reserve_addr(xs->umem->cq))
                        goto out;

                if (xs->queue_id >= xs->dev->real_num_tx_queues)
                        goto out;

                len = desc.len;
                skb = sock_alloc_send_skb(sk, len, 1, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                addr = desc.addr;
                buffer = xdp_umem_get_data(xs->umem, addr);
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                xskq_discard_desc(xs->tx);
                /* Ignore NET_XMIT_CN as the packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        /* SKB completed but not sent */
                        err = -EBUSY;
                        goto out;
                }

                sent_frame = true;
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}
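
/* sendmsg() on an AF_XDP socket acts as a non-blocking transmit kick: it
 * drains the TX ring itself in copy mode, or asks the driver to do so in
 * zero-copy mode. Blocking sends are rejected with -EOPNOTSUPP, so
 * userspace is expected to pass MSG_DONTWAIT. A typical kick from
 * userspace (illustrative only) looks like:
 *
 *      sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */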
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xs->dev))
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
        if (unlikely(!xs->tx))
                return -ENOBUFS;
        if (need_wait)
                return -EOPNOTSUPP;

        return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        *queue = q;
        return 0;
}

static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        mutex_lock(&net->xdp.lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        if (xs->dev) {
                struct net_device *dev = xs->dev;

                /* Wait for driver to stop using the xdp socket. */
                xdp_del_sk_umem(xs->umem, xs);
                xs->dev = NULL;
                synchronize_net();
                dev_put(dev);
        }

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}
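
/* Bind the socket to a <netdev, queue_id> pair. XDP_SHARED_UMEM makes this
 * socket reuse the umem of an already bound socket (given through
 * sxdp_shared_umem_fd) and cannot be combined with XDP_COPY or
 * XDP_ZEROCOPY; otherwise the socket must have registered its own umem
 * first. A minimal userspace call (illustrative only; ifindex and queue_id
 * are assumed to be set up by the caller):
 *
 *      struct sockaddr_xdp sxdp = {
 *              .sxdp_family = AF_XDP,
 *              .sxdp_ifindex = ifindex,
 *              .sxdp_queue_id = queue_id,
 *      };
 *
 *      bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */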
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        u32 flags, qid;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        flags = sxdp->sxdp_flags;
        if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
                return -EINVAL;

        mutex_lock(&xs->mutex);
        if (xs->dev) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        qid = sxdp->sxdp_queue_id;

        if (flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                if (xs->umem) {
                        /* We already have our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!umem_xs->umem) {
                        /* No umem to inherit. */
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                xs->umem = umem_xs->umem;
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, xs->umem->size,
                              xs->umem->chunk_mask);
                xskq_set_umem(xs->umem->cq, xs->umem->size,
                              xs->umem->chunk_mask);

                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;
        }

        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->queue_id = qid;
        xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
        xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
        xdp_add_sk_umem(xs->umem, xs);

out_unlock:
        if (err)
                dev_put(dev);
out_release:
        mutex_unlock(&xs->mutex);
        return err;
}
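
/* Socket options: XDP_RX_RING and XDP_TX_RING size the descriptor rings,
 * XDP_UMEM_REG registers the umem area, and XDP_UMEM_FILL_RING and
 * XDP_UMEM_COMPLETION_RING size the umem rings (which require a registered
 * umem). All ring sizes must be a power of two. A typical userspace setup
 * order (illustrative only; buf is assumed to be a page-aligned buffer of
 * buf_len bytes, and error handling is omitted):
 *
 *      struct xdp_umem_reg mr = {
 *              .addr = (__u64)buf,
 *              .len = buf_len,
 *              .chunk_size = 2048,
 *              .headroom = 0,
 *      };
 *      int n = 2048;
 *
 *      setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING, &n, sizeof(n));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &n, sizeof(n));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &n, sizeof(n));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_TX_RING, &n, sizeof(n));
 */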
static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                struct xdp_umem_reg mr;
                struct xdp_umem *umem;

                if (copy_from_user(&mr, optval, sizeof(mr)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                xs->umem = umem;
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;

                if (len < sizeof(off))
                        return -EINVAL;

                off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
                off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

                off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.fr.desc = offsetof(struct xdp_umem_ring, desc);
                off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.cr.desc = offsetof(struct xdp_umem_ring, desc);

                len = sizeof(off);
                if (copy_to_user(optval, &off, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_OPTIONS:
        {
                struct xdp_options opts = {};

                if (len < sizeof(opts))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                if (xs->zc)
                        opts.flags |= XDP_OPTIONS_ZEROCOPY;
                mutex_unlock(&xs->mutex);

                len = sizeof(opts);
                if (copy_to_user(optval, &opts, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}
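
/* mmap() maps one ring per call, selected by the page offset: the RX/TX
 * descriptor rings of the socket, or the fill/completion rings of the
 * umem. Userspace is expected to query XDP_MMAP_OFFSETS first and then map
 * each ring, for example (illustrative only; ndescs is the configured ring
 * size):
 *
 *      rx_map = mmap(NULL, off.rx.desc + ndescs * sizeof(struct xdp_desc),
 *                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *                    xsk_fd, XDP_PGOFF_RX_RING);
 */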
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                /* Matches the smp_wmb() in XDP_UMEM_REG */
                smp_rmb();
                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        /* Matches the smp_wmb() in xsk_init_queue */
        smp_rmb();
        qpg = virt_to_head_page(q->ring);
        if (size > (PAGE_SIZE << compound_order(qpg)))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
        .name     = "XDP",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family     = PF_XDP,
        .owner      = THIS_MODULE,
        .release    = xsk_release,
        .bind       = xsk_bind,
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .getname    = sock_no_getname,
        .poll       = xsk_poll,
        .ioctl      = sock_no_ioctl,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
        .setsockopt = xsk_setsockopt,
        .getsockopt = xsk_getsockopt,
        .sendmsg    = xsk_sendmsg,
        .recvmsg    = sock_no_recvmsg,
        .mmap       = xsk_mmap,
        .sendpage   = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        sock_set_flag(sk, SOCK_RCU_FREE);

        xs = xdp_sk(sk);
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->tx_completion_lock);

        mutex_lock(&net->xdp.lock);
        sk_add_node_rcu(sk, &net->xdp.list);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner  = THIS_MODULE,
};

static int __net_init xsk_net_init(struct net *net)
{
        mutex_init(&net->xdp.lock);
        INIT_HLIST_HEAD(&net->xdp.list);
        return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
        WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
        .init = xsk_net_init,
        .exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        err = register_pernet_subsys(&xsk_net_ops);
        if (err)
                goto out_sk;
        return 0;

out_sk:
        sock_unregister(PF_XDP);
out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);
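
/* Summary of the userspace lifecycle implemented by this file (illustrative
 * outline, not a complete example): socket(AF_XDP, SOCK_RAW, 0), then
 * setsockopt() for XDP_UMEM_REG and the four rings, mmap() of each ring
 * using the offsets from XDP_MMAP_OFFSETS, bind() to an <ifindex, queue_id>
 * pair, and finally producing fill-ring entries for RX and/or TX
 * descriptors followed by a sendto() kick for TX.
 */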