xref: /linux/net/xdp/xsk.c (revision bab2c80e5a6c855657482eac9e97f5f3eedb509a)
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets provide a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

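/* Copy-mode Rx: grab a chunk address from the fill queue, copy the
 * frame into the umem chunk and post a descriptor on the Rx ring.
 * The frame is dropped if no chunk is available or it does not fit.
 */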
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *buffer;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

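/* Zero-copy Rx: the frame already lives in umem, so only a descriptor
 * referencing its handle has to be posted on the Rx ring.
 */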
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err) {
		xdp_return_buff(xdp);
		xs->rx_dropped++;
	}

	return err;
}

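/* Rx entry point for redirected frames. The frame must arrive on the
 * device and queue id that the socket is bound to.
 */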
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

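/* Rx path for generic (skb-based) XDP: always copies the frame and
 * flushes the Rx ring immediately.
 */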
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xsk_flush(xs);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

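/* Make nb_entries previously reserved Tx completions visible to user
 * space on the completion ring.
 */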
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

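/* Zero-copy Tx helper for drivers: fetch one descriptor from a socket
 * sharing this umem, lazily reserve a completion entry for it, and
 * return the frame's DMA address and length.
 */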
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
{
	struct xdp_desc desc;
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, &desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
			goto out;

		*dma = xdp_umem_get_dma(umem, desc.addr);
		*len = desc.len;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

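/* Zero-copy Tx: ask the driver to start transmitting from the Tx ring. */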
static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

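/* skb destructor for copy-mode Tx: post the completed umem address on
 * the completion ring, then release the socket send-buffer accounting
 * via sock_wfree().
 */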
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

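/* Copy-mode Tx: copy up to TX_BATCH_SIZE descriptors from the Tx ring
 * into newly allocated skbs and send them with dev_direct_xmit() on the
 * bound queue. Completions are posted from xsk_destruct_skb().
 */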
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	if (unlikely(!xs->tx))
		return -ENOBUFS;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_addr(xs->umem->cq)) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		if (unlikely(len > xs->dev->mtu)) {
			err = -EMSGSIZE;
			goto out;
		}

		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
			err = -ENXIO;
			goto out;
		}

		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

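/* Readable when the Rx ring has entries, writable as long as the Tx
 * ring is not full.
 */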
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}

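/* Socket release: detach from the netdev. The rings and the umem are
 * torn down later in xsk_destruct() when the last reference is dropped.
 */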
static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		/* Wait for driver to stop using the xdp socket. */
		synchronize_net();
		dev_put(xs->dev);
		xs->dev = NULL;
	}

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

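/* Bind the socket to a device and queue id. With XDP_SHARED_UMEM the
 * umem is inherited from an already bound socket on the same device and
 * queue; otherwise this socket's own umem is assigned to the device.
 */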
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
	    (xs->tx && qid >= dev->real_num_tx_queues)) {
		err = -EINVAL;
		goto out_unlock;
	}

	flags = sxdp->sxdp_flags;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, &xs->umem->props);
		xskq_set_umem(xs->umem->cq, &xs->umem->props);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, &xs->umem->props);
	xskq_set_umem(xs->tx, &xs->umem->props);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

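/* Socket options create the Rx/Tx rings and register the umem with its
 * fill and completion rings; bind() expects the pieces it needs to
 * already be in place.
 */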
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

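/* Read-only options: XDP_STATISTICS reports drop and invalid-descriptor
 * counters, XDP_MMAP_OFFSETS reports the ring layout user space needs
 * in order to mmap() the rings.
 */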
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

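/* mmap() exposes one ring per call; the page offset selects whether the
 * Rx, Tx, fill or completion ring is mapped.
 */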
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_del_sk_umem(xs->umem, xs);
	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->tx_completion_lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);