xref: /linux/net/xdp/xsk.c (revision 4d374ba0bf30a2a372167ee4b7cdd527e7b47b3b)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31 
32 #define TX_BATCH_SIZE 16
33 
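/* A typical userspace setup sequence, in rough order (a sketch only; see
 * Documentation/networking/af_xdp.rst for the full description):
 * socket(AF_XDP, SOCK_RAW, 0), then XDP_UMEM_REG, then the FILL,
 * COMPLETION, RX and TX rings via setsockopt(), then XDP_MMAP_OFFSETS
 * plus mmap() of each ring, and finally bind() to a device and queue id.
 * Illustrative examples of the individual steps appear as comments next
 * to the handlers below.
 */
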
34 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
35 {
36 	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
37 		READ_ONCE(xs->umem->fq);
38 }
39 
40 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
41 {
42 	return xskq_has_addrs(umem->fq, cnt);
43 }
44 EXPORT_SYMBOL(xsk_umem_has_addrs);
45 
46 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
47 {
48 	return xskq_peek_addr(umem->fq, addr);
49 }
50 EXPORT_SYMBOL(xsk_umem_peek_addr);
51 
52 void xsk_umem_discard_addr(struct xdp_umem *umem)
53 {
54 	xskq_discard_addr(umem->fq);
55 }
56 EXPORT_SYMBOL(xsk_umem_discard_addr);
57 
58 void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
59 {
60 	if (umem->need_wakeup & XDP_WAKEUP_RX)
61 		return;
62 
63 	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
64 	umem->need_wakeup |= XDP_WAKEUP_RX;
65 }
66 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
67 
68 void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
69 {
70 	struct xdp_sock *xs;
71 
72 	if (umem->need_wakeup & XDP_WAKEUP_TX)
73 		return;
74 
75 	rcu_read_lock();
76 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
77 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
78 	}
79 	rcu_read_unlock();
80 
81 	umem->need_wakeup |= XDP_WAKEUP_TX;
82 }
83 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
84 
85 void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
86 {
87 	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
88 		return;
89 
90 	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
91 	umem->need_wakeup &= ~XDP_WAKEUP_RX;
92 }
93 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
94 
95 void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
96 {
97 	struct xdp_sock *xs;
98 
99 	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
100 		return;
101 
102 	rcu_read_lock();
103 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
104 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
105 	}
106 	rcu_read_unlock();
107 
108 	umem->need_wakeup &= ~XDP_WAKEUP_TX;
109 }
110 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
111 
112 bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
113 {
114 	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
115 }
116 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
117 
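/* The need_wakeup helpers above let a zero-copy driver tell userspace
 * whether it has to kick the kernel with a syscall before more work gets
 * done.  An illustrative napi-poll fragment, assuming a driver-local
 * notion of "ran out of fill entries" and "tx work pending" (names here
 * are placeholders, not taken from any real driver):
 *
 *	if (xsk_umem_uses_need_wakeup(umem)) {
 *		if (fill_ring_empty)
 *			xsk_set_rx_need_wakeup(umem);
 *		else
 *			xsk_clear_rx_need_wakeup(umem);
 *
 *		if (tx_work_pending)
 *			xsk_set_tx_need_wakeup(umem);
 *		else
 *			xsk_clear_tx_need_wakeup(umem);
 *	}
 */
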
118 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
119 {
120 	void *to_buf, *from_buf;
121 	u32 metalen;
122 	u64 addr;
123 	int err;
124 
125 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
126 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
127 		xs->rx_dropped++;
128 		return -ENOSPC;
129 	}
130 
131 	addr += xs->umem->headroom;
132 
133 	if (unlikely(xdp_data_meta_unsupported(xdp))) {
134 		from_buf = xdp->data;
135 		metalen = 0;
136 	} else {
137 		from_buf = xdp->data_meta;
138 		metalen = xdp->data - xdp->data_meta;
139 	}
140 
141 	to_buf = xdp_umem_get_data(xs->umem, addr);
142 	memcpy(to_buf, from_buf, len + metalen);
143 	addr += metalen;
144 	err = xskq_produce_batch_desc(xs->rx, addr, len);
145 	if (!err) {
146 		xskq_discard_addr(xs->umem->fq);
147 		xdp_return_buff(xdp);
148 		return 0;
149 	}
150 
151 	xs->rx_dropped++;
152 	return err;
153 }
154 
155 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
156 {
157 	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
158 
159 	if (err)
160 		xs->rx_dropped++;
161 
162 	return err;
163 }
164 
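/* xsk_rcv() and xsk_flush() below are invoked from the XDP_REDIRECT path
 * when a program redirects a packet to a socket in an XSKMAP.  In copy
 * mode, __xsk_rcv() grabs a chunk from the fill ring and copies the frame
 * into it; in zero-copy mode, __xsk_rcv_zc() only posts the handle of the
 * driver-provided buffer to the RX ring.
 */
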
165 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
166 {
167 	u32 len;
168 
169 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
170 		return -EINVAL;
171 
172 	len = xdp->data_end - xdp->data;
173 
174 	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
175 		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
176 }
177 
178 void xsk_flush(struct xdp_sock *xs)
179 {
180 	xskq_produce_flush_desc(xs->rx);
181 	xs->sk.sk_data_ready(&xs->sk);
182 }
183 
184 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
185 {
186 	u32 metalen = xdp->data - xdp->data_meta;
187 	u32 len = xdp->data_end - xdp->data;
188 	void *buffer;
189 	u64 addr;
190 	int err;
191 
192 	spin_lock_bh(&xs->rx_lock);
193 
194 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
195 		err = -EINVAL;
196 		goto out_unlock;
197 	}
198 
199 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
200 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
201 		err = -ENOSPC;
202 		goto out_drop;
203 	}
204 
205 	addr += xs->umem->headroom;
206 
207 	buffer = xdp_umem_get_data(xs->umem, addr);
208 	memcpy(buffer, xdp->data_meta, len + metalen);
209 	addr += metalen;
210 	err = xskq_produce_batch_desc(xs->rx, addr, len);
211 	if (err)
212 		goto out_drop;
213 
214 	xskq_discard_addr(xs->umem->fq);
215 	xskq_produce_flush_desc(xs->rx);
216 
217 	spin_unlock_bh(&xs->rx_lock);
218 
219 	xs->sk.sk_data_ready(&xs->sk);
220 	return 0;
221 
222 out_drop:
223 	xs->rx_dropped++;
224 out_unlock:
225 	spin_unlock_bh(&xs->rx_lock);
226 	return err;
227 }
228 
229 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
230 {
231 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
232 }
233 EXPORT_SYMBOL(xsk_umem_complete_tx);
234 
235 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
236 {
237 	struct xdp_sock *xs;
238 
239 	rcu_read_lock();
240 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
241 		xs->sk.sk_write_space(&xs->sk);
242 	}
243 	rcu_read_unlock();
244 }
245 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
246 
247 bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
248 {
249 	struct xdp_sock *xs;
250 
251 	rcu_read_lock();
252 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
253 		if (!xskq_peek_desc(xs->tx, desc))
254 			continue;
255 
256 		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
257 			goto out;
258 
259 		xskq_discard_desc(xs->tx);
260 		rcu_read_unlock();
261 		return true;
262 	}
263 
264 out:
265 	rcu_read_unlock();
266 	return false;
267 }
268 EXPORT_SYMBOL(xsk_umem_consume_tx);
269 
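/* Illustrative use of the TX consumption API by a zero-copy driver's
 * transmit path (the budget handling, xmit_one_frame() and the completion
 * accounting are placeholders, not a real driver):
 *
 *	while (budget-- && xsk_umem_consume_tx(umem, &desc)) {
 *		dma_addr_t dma = xdp_umem_get_dma(umem, desc.addr);
 *
 *		xmit_one_frame(dma, desc.len);
 *	}
 *	xsk_umem_consume_tx_done(umem);
 *
 * and, once the hardware has completed nframes transmissions:
 *
 *	xsk_umem_complete_tx(umem, nframes);
 */
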
270 static int xsk_zc_xmit(struct sock *sk)
271 {
272 	struct xdp_sock *xs = xdp_sk(sk);
273 	struct net_device *dev = xs->dev;
274 
275 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
276 					       XDP_WAKEUP_TX);
277 }
278 
279 static void xsk_destruct_skb(struct sk_buff *skb)
280 {
281 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
282 	struct xdp_sock *xs = xdp_sk(skb->sk);
283 	unsigned long flags;
284 
285 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
286 	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
287 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
288 
289 	sock_wfree(skb);
290 }
291 
292 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
293 			    size_t total_len)
294 {
295 	u32 max_batch = TX_BATCH_SIZE;
296 	struct xdp_sock *xs = xdp_sk(sk);
297 	bool sent_frame = false;
298 	struct xdp_desc desc;
299 	struct sk_buff *skb;
300 	int err = 0;
301 
302 	mutex_lock(&xs->mutex);
303 
304 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
305 		goto out;
306 
307 	while (xskq_peek_desc(xs->tx, &desc)) {
308 		char *buffer;
309 		u64 addr;
310 		u32 len;
311 
312 		if (max_batch-- == 0) {
313 			err = -EAGAIN;
314 			goto out;
315 		}
316 
317 		len = desc.len;
318 		skb = sock_alloc_send_skb(sk, len, 1, &err);
319 		if (unlikely(!skb)) {
320 			err = -EAGAIN;
321 			goto out;
322 		}
323 
324 		skb_put(skb, len);
325 		addr = desc.addr;
326 		buffer = xdp_umem_get_data(xs->umem, addr);
327 		err = skb_store_bits(skb, 0, buffer, len);
328 		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
329 			kfree_skb(skb);
330 			goto out;
331 		}
332 
333 		skb->dev = xs->dev;
334 		skb->priority = sk->sk_priority;
335 		skb->mark = sk->sk_mark;
336 		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
337 		skb->destructor = xsk_destruct_skb;
338 
339 		err = dev_direct_xmit(skb, xs->queue_id);
340 		xskq_discard_desc(xs->tx);
341 		/* Ignore NET_XMIT_CN as the packet might have been sent */
342 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
343 			/* SKB completed but not sent */
344 			err = -EBUSY;
345 			goto out;
346 		}
347 
348 		sent_frame = true;
349 	}
350 
351 out:
352 	if (sent_frame)
353 		sk->sk_write_space(sk);
354 
355 	mutex_unlock(&xs->mutex);
356 	return err;
357 }
358 
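/* From userspace, copy-mode TX amounts to filling the TX ring and then
 * kicking the kernel, which lands in xsk_generic_xmit() above.  A hedged
 * sketch, with the ring pointers set up from XDP_MMAP_OFFSETS (names are
 * illustrative):
 *
 *	struct xdp_desc *d = &tx_descs[prod & (nentries - 1)];
 *
 *	d->addr = frame_addr;
 *	d->len = frame_len;
 *	__atomic_store_n(tx_producer, ++prod, __ATOMIC_RELEASE);
 *	sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 *
 * Completed addresses come back on the COMPLETION ring, produced by
 * xsk_destruct_skb() above.
 */
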
359 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
360 {
361 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
362 	struct sock *sk = sock->sk;
363 	struct xdp_sock *xs = xdp_sk(sk);
364 
365 	if (unlikely(!xs->dev))
366 		return -ENXIO;
367 	if (unlikely(!(xs->dev->flags & IFF_UP)))
368 		return -ENETDOWN;
369 	if (unlikely(!xs->tx))
370 		return -ENOBUFS;
371 	if (need_wait)
372 		return -EOPNOTSUPP;
373 
374 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
375 }
376 
377 static unsigned int xsk_poll(struct file *file, struct socket *sock,
378 			     struct poll_table_struct *wait)
379 {
380 	unsigned int mask = datagram_poll(file, sock, wait);
381 	struct sock *sk = sock->sk;
382 	struct xdp_sock *xs = xdp_sk(sk);
383 	struct net_device *dev = xs->dev;
384 	struct xdp_umem *umem = xs->umem;
385 
386 	if (umem->need_wakeup)
387 		dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
388 						umem->need_wakeup);
389 
390 	if (xs->rx && !xskq_empty_desc(xs->rx))
391 		mask |= POLLIN | POLLRDNORM;
392 	if (xs->tx && !xskq_full_desc(xs->tx))
393 		mask |= POLLOUT | POLLWRNORM;
394 
395 	return mask;
396 }
397 
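/* With XDP_USE_NEED_WAKEUP, userspace only needs to issue syscalls when
 * the kernel has set XDP_RING_NEED_WAKEUP in a ring's flags word;
 * xsk_poll() above forwards such wakeups to the driver through
 * ndo_xsk_wakeup().  A hedged userspace sketch, with tx_map/fill_map and
 * off coming from mmap() and XDP_MMAP_OFFSETS:
 *
 *	if (*(__u32 *)((char *)tx_map + off.tx.flags) & XDP_RING_NEED_WAKEUP)
 *		sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 *
 *	if (*(__u32 *)((char *)fill_map + off.fr.flags) & XDP_RING_NEED_WAKEUP)
 *		poll(&pfd, 1, timeout);
 */
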
398 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
399 			  bool umem_queue)
400 {
401 	struct xsk_queue *q;
402 
403 	if (entries == 0 || *queue || !is_power_of_2(entries))
404 		return -EINVAL;
405 
406 	q = xskq_create(entries, umem_queue);
407 	if (!q)
408 		return -ENOMEM;
409 
410 	/* Make sure queue is ready before it can be seen by others */
411 	smp_wmb();
412 	*queue = q;
413 	return 0;
414 }
415 
416 static void xsk_unbind_dev(struct xdp_sock *xs)
417 {
418 	struct net_device *dev = xs->dev;
419 
420 	if (!dev || xs->state != XSK_BOUND)
421 		return;
422 
423 	xs->state = XSK_UNBOUND;
424 
425 	/* Wait for driver to stop using the xdp socket. */
426 	xdp_del_sk_umem(xs->umem, xs);
427 	xs->dev = NULL;
428 	synchronize_net();
429 	dev_put(dev);
430 }
431 
432 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
433 					      struct xdp_sock ***map_entry)
434 {
435 	struct xsk_map *map = NULL;
436 	struct xsk_map_node *node;
437 
438 	*map_entry = NULL;
439 
440 	spin_lock_bh(&xs->map_list_lock);
441 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
442 					node);
443 	if (node) {
444 		WARN_ON(xsk_map_inc(node->map));
445 		map = node->map;
446 		*map_entry = node->map_entry;
447 	}
448 	spin_unlock_bh(&xs->map_list_lock);
449 	return map;
450 }
451 
452 static void xsk_delete_from_maps(struct xdp_sock *xs)
453 {
454 	/* This function removes the current XDP socket from all the
455 	 * maps it resides in. We need to take extra care here, due to
456 	 * the two locks involved. Each map has a lock synchronizing
457 	 * updates to the entries, and each socket has a lock that
458 	 * synchronizes access to the list of maps (map_list). For
459 	 * deadlock avoidance the locks need to be taken in the order
460 	 * "map lock"->"socket map list lock". We start off by
461 	 * accessing the socket map list, and take a reference to the
462 	 * map to guarantee existence between the
463 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
464 	 * calls. Then we ask the map to remove the socket, which
465 	 * tries to remove the socket from the map. Note that there
466 	 * might be updates to the map between
467 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
468 	 */
469 	struct xdp_sock **map_entry = NULL;
470 	struct xsk_map *map;
471 
472 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
473 		xsk_map_try_sock_delete(map, xs, map_entry);
474 		xsk_map_put(map);
475 	}
476 }
477 
478 static int xsk_release(struct socket *sock)
479 {
480 	struct sock *sk = sock->sk;
481 	struct xdp_sock *xs = xdp_sk(sk);
482 	struct net *net;
483 
484 	if (!sk)
485 		return 0;
486 
487 	net = sock_net(sk);
488 
489 	mutex_lock(&net->xdp.lock);
490 	sk_del_node_init_rcu(sk);
491 	mutex_unlock(&net->xdp.lock);
492 
493 	local_bh_disable();
494 	sock_prot_inuse_add(net, sk->sk_prot, -1);
495 	local_bh_enable();
496 
497 	xsk_delete_from_maps(xs);
498 	xsk_unbind_dev(xs);
499 
500 	xskq_destroy(xs->rx);
501 	xskq_destroy(xs->tx);
502 
503 	sock_orphan(sk);
504 	sock->sk = NULL;
505 
506 	sk_refcnt_debug_release(sk);
507 	sock_put(sk);
508 
509 	return 0;
510 }
511 
512 static struct socket *xsk_lookup_xsk_from_fd(int fd)
513 {
514 	struct socket *sock;
515 	int err;
516 
517 	sock = sockfd_lookup(fd, &err);
518 	if (!sock)
519 		return ERR_PTR(-ENOTSOCK);
520 
521 	if (sock->sk->sk_family != PF_XDP) {
522 		sockfd_put(sock);
523 		return ERR_PTR(-ENOPROTOOPT);
524 	}
525 
526 	return sock;
527 }
528 
529 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
530 {
531 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
532 	struct sock *sk = sock->sk;
533 	struct xdp_sock *xs = xdp_sk(sk);
534 	struct net_device *dev;
535 	u32 flags, qid;
536 	int err = 0;
537 
538 	if (addr_len < sizeof(struct sockaddr_xdp))
539 		return -EINVAL;
540 	if (sxdp->sxdp_family != AF_XDP)
541 		return -EINVAL;
542 
543 	flags = sxdp->sxdp_flags;
544 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
545 		      XDP_USE_NEED_WAKEUP))
546 		return -EINVAL;
547 
548 	rtnl_lock();
549 	mutex_lock(&xs->mutex);
550 	if (xs->state != XSK_READY) {
551 		err = -EBUSY;
552 		goto out_release;
553 	}
554 
555 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
556 	if (!dev) {
557 		err = -ENODEV;
558 		goto out_release;
559 	}
560 
561 	if (!xs->rx && !xs->tx) {
562 		err = -EINVAL;
563 		goto out_unlock;
564 	}
565 
566 	qid = sxdp->sxdp_queue_id;
567 
568 	if (flags & XDP_SHARED_UMEM) {
569 		struct xdp_sock *umem_xs;
570 		struct socket *sock;
571 
572 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
573 		    (flags & XDP_USE_NEED_WAKEUP)) {
574 			/* Cannot specify flags for shared sockets. */
575 			err = -EINVAL;
576 			goto out_unlock;
577 		}
578 
579 		if (xs->umem) {
580 			/* We have already our own. */
581 			err = -EINVAL;
582 			goto out_unlock;
583 		}
584 
585 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
586 		if (IS_ERR(sock)) {
587 			err = PTR_ERR(sock);
588 			goto out_unlock;
589 		}
590 
591 		umem_xs = xdp_sk(sock->sk);
592 		if (!umem_xs->umem) {
593 			/* No umem to inherit. */
594 			err = -EBADF;
595 			sockfd_put(sock);
596 			goto out_unlock;
597 		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
598 			err = -EINVAL;
599 			sockfd_put(sock);
600 			goto out_unlock;
601 		}
602 
603 		xdp_get_umem(umem_xs->umem);
604 		xs->umem = umem_xs->umem;
605 		sockfd_put(sock);
606 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
607 		err = -EINVAL;
608 		goto out_unlock;
609 	} else {
610 		/* This xsk has its own umem. */
611 		xskq_set_umem(xs->umem->fq, xs->umem->size,
612 			      xs->umem->chunk_mask);
613 		xskq_set_umem(xs->umem->cq, xs->umem->size,
614 			      xs->umem->chunk_mask);
615 
616 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
617 		if (err)
618 			goto out_unlock;
619 	}
620 
621 	xs->dev = dev;
622 	xs->zc = xs->umem->zc;
623 	xs->queue_id = qid;
624 	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
625 	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
626 	xdp_add_sk_umem(xs->umem, xs);
627 
628 out_unlock:
629 	if (err)
630 		dev_put(dev);
631 	else
632 		xs->state = XSK_BOUND;
633 out_release:
634 	mutex_unlock(&xs->mutex);
635 	rtnl_unlock();
636 	return err;
637 }
638 
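/* Binding from userspace, matching the checks in xsk_bind() above (the
 * interface name and queue id are illustrative):
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = if_nametoindex("eth0"),
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags = XDP_USE_NEED_WAKEUP,
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * With XDP_SHARED_UMEM, sxdp_flags may carry only that flag and
 * sxdp_shared_umem_fd must name an already bound socket on the same
 * device and queue id whose umem is then inherited.
 */
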
639 static int xsk_setsockopt(struct socket *sock, int level, int optname,
640 			  char __user *optval, unsigned int optlen)
641 {
642 	struct sock *sk = sock->sk;
643 	struct xdp_sock *xs = xdp_sk(sk);
644 	int err;
645 
646 	if (level != SOL_XDP)
647 		return -ENOPROTOOPT;
648 
649 	switch (optname) {
650 	case XDP_RX_RING:
651 	case XDP_TX_RING:
652 	{
653 		struct xsk_queue **q;
654 		int entries;
655 
656 		if (optlen < sizeof(entries))
657 			return -EINVAL;
658 		if (copy_from_user(&entries, optval, sizeof(entries)))
659 			return -EFAULT;
660 
661 		mutex_lock(&xs->mutex);
662 		if (xs->state != XSK_READY) {
663 			mutex_unlock(&xs->mutex);
664 			return -EBUSY;
665 		}
666 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
667 		err = xsk_init_queue(entries, q, false);
668 		if (!err && optname == XDP_TX_RING)
669 			/* Tx needs to be explicitly woken up the first time */
670 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
671 		mutex_unlock(&xs->mutex);
672 		return err;
673 	}
674 	case XDP_UMEM_REG:
675 	{
676 		struct xdp_umem_reg mr;
677 		struct xdp_umem *umem;
678 
679 		if (copy_from_user(&mr, optval, sizeof(mr)))
680 			return -EFAULT;
681 
682 		mutex_lock(&xs->mutex);
683 		if (xs->state != XSK_READY || xs->umem) {
684 			mutex_unlock(&xs->mutex);
685 			return -EBUSY;
686 		}
687 
688 		umem = xdp_umem_create(&mr);
689 		if (IS_ERR(umem)) {
690 			mutex_unlock(&xs->mutex);
691 			return PTR_ERR(umem);
692 		}
693 
694 		/* Make sure umem is ready before it can be seen by others */
695 		smp_wmb();
696 		xs->umem = umem;
697 		mutex_unlock(&xs->mutex);
698 		return 0;
699 	}
700 	case XDP_UMEM_FILL_RING:
701 	case XDP_UMEM_COMPLETION_RING:
702 	{
703 		struct xsk_queue **q;
704 		int entries;
705 
706 		if (copy_from_user(&entries, optval, sizeof(entries)))
707 			return -EFAULT;
708 
709 		mutex_lock(&xs->mutex);
710 		if (xs->state != XSK_READY) {
711 			mutex_unlock(&xs->mutex);
712 			return -EBUSY;
713 		}
714 		if (!xs->umem) {
715 			mutex_unlock(&xs->mutex);
716 			return -EINVAL;
717 		}
718 
719 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
720 			&xs->umem->cq;
721 		err = xsk_init_queue(entries, q, true);
722 		mutex_unlock(&xs->mutex);
723 		return err;
724 	}
725 	default:
726 		break;
727 	}
728 
729 	return -ENOPROTOOPT;
730 }
731 
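/* Registering a umem and creating the rings from userspace, as handled
 * by xsk_setsockopt() above (a sketch; error handling omitted, ring
 * sizes must be powers of two and umem_area must be page aligned):
 *
 *	int nentries = 2048;
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)umem_area,
 *		.len = umem_area_size,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &nentries, sizeof(nentries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &nentries, sizeof(nentries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &nentries, sizeof(nentries));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &nentries, sizeof(nentries));
 */
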
732 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
733 {
734 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
735 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
736 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
737 }
738 
739 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
740 {
741 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
742 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
743 	ring->desc = offsetof(struct xdp_umem_ring, desc);
744 }
745 
746 static int xsk_getsockopt(struct socket *sock, int level, int optname,
747 			  char __user *optval, int __user *optlen)
748 {
749 	struct sock *sk = sock->sk;
750 	struct xdp_sock *xs = xdp_sk(sk);
751 	int len;
752 
753 	if (level != SOL_XDP)
754 		return -ENOPROTOOPT;
755 
756 	if (get_user(len, optlen))
757 		return -EFAULT;
758 	if (len < 0)
759 		return -EINVAL;
760 
761 	switch (optname) {
762 	case XDP_STATISTICS:
763 	{
764 		struct xdp_statistics stats;
765 
766 		if (len < sizeof(stats))
767 			return -EINVAL;
768 
769 		mutex_lock(&xs->mutex);
770 		stats.rx_dropped = xs->rx_dropped;
771 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
772 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
773 		mutex_unlock(&xs->mutex);
774 
775 		if (copy_to_user(optval, &stats, sizeof(stats)))
776 			return -EFAULT;
777 		if (put_user(sizeof(stats), optlen))
778 			return -EFAULT;
779 
780 		return 0;
781 	}
782 	case XDP_MMAP_OFFSETS:
783 	{
784 		struct xdp_mmap_offsets off;
785 		struct xdp_mmap_offsets_v1 off_v1;
786 		bool flags_supported = true;
787 		void *to_copy;
788 
789 		if (len < sizeof(off_v1))
790 			return -EINVAL;
791 		else if (len < sizeof(off))
792 			flags_supported = false;
793 
794 		if (flags_supported) {
795 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
796 			 * except for the flags field added to the end.
797 			 */
798 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
799 					       &off.rx);
800 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
801 					       &off.tx);
802 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
803 					       &off.fr);
804 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
805 					       &off.cr);
806 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
807 						ptrs.flags);
808 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
809 						ptrs.flags);
810 			off.fr.flags = offsetof(struct xdp_umem_ring,
811 						ptrs.flags);
812 			off.cr.flags = offsetof(struct xdp_umem_ring,
813 						ptrs.flags);
814 
815 			len = sizeof(off);
816 			to_copy = &off;
817 		} else {
818 			xsk_enter_rxtx_offsets(&off_v1.rx);
819 			xsk_enter_rxtx_offsets(&off_v1.tx);
820 			xsk_enter_umem_offsets(&off_v1.fr);
821 			xsk_enter_umem_offsets(&off_v1.cr);
822 
823 			len = sizeof(off_v1);
824 			to_copy = &off_v1;
825 		}
826 
827 		if (copy_to_user(optval, to_copy, len))
828 			return -EFAULT;
829 		if (put_user(len, optlen))
830 			return -EFAULT;
831 
832 		return 0;
833 	}
834 	case XDP_OPTIONS:
835 	{
836 		struct xdp_options opts = {};
837 
838 		if (len < sizeof(opts))
839 			return -EINVAL;
840 
841 		mutex_lock(&xs->mutex);
842 		if (xs->zc)
843 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
844 		mutex_unlock(&xs->mutex);
845 
846 		len = sizeof(opts);
847 		if (copy_to_user(optval, &opts, len))
848 			return -EFAULT;
849 		if (put_user(len, optlen))
850 			return -EFAULT;
851 
852 		return 0;
853 	}
854 	default:
855 		break;
856 	}
857 
858 	return -EOPNOTSUPP;
859 }
860 
861 static int xsk_mmap(struct file *file, struct socket *sock,
862 		    struct vm_area_struct *vma)
863 {
864 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
865 	unsigned long size = vma->vm_end - vma->vm_start;
866 	struct xdp_sock *xs = xdp_sk(sock->sk);
867 	struct xsk_queue *q = NULL;
868 	struct xdp_umem *umem;
869 	unsigned long pfn;
870 	struct page *qpg;
871 
872 	if (xs->state != XSK_READY)
873 		return -EBUSY;
874 
875 	if (offset == XDP_PGOFF_RX_RING) {
876 		q = READ_ONCE(xs->rx);
877 	} else if (offset == XDP_PGOFF_TX_RING) {
878 		q = READ_ONCE(xs->tx);
879 	} else {
880 		umem = READ_ONCE(xs->umem);
881 		if (!umem)
882 			return -EINVAL;
883 
884 		/* Matches the smp_wmb() in XDP_UMEM_REG */
885 		smp_rmb();
886 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
887 			q = READ_ONCE(umem->fq);
888 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
889 			q = READ_ONCE(umem->cq);
890 	}
891 
892 	if (!q)
893 		return -EINVAL;
894 
895 	/* Matches the smp_wmb() in xsk_init_queue */
896 	smp_rmb();
897 	qpg = virt_to_head_page(q->ring);
898 	if (size > (PAGE_SIZE << compound_order(qpg)))
899 		return -EINVAL;
900 
901 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
902 	return remap_pfn_range(vma, vma->vm_start, pfn,
903 			       size, vma->vm_page_prot);
904 }
905 
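/* Mapping a ring from userspace, matching xsk_mmap() and the
 * XDP_MMAP_OFFSETS handling above (RX ring shown; the other rings use
 * XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING and
 * XDP_UMEM_PGOFF_COMPLETION_RING, with the fill/completion rings holding
 * __u64 addresses instead of xdp_desc entries):
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	void *rx_map;
 *
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	rx_map = mmap(NULL, off.rx.desc + nentries * sizeof(struct xdp_desc),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      fd, XDP_PGOFF_RX_RING);
 *	rx_producer = (__u32 *)((char *)rx_map + off.rx.producer);
 *	rx_consumer = (__u32 *)((char *)rx_map + off.rx.consumer);
 *	rx_descs = (struct xdp_desc *)((char *)rx_map + off.rx.desc);
 */
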
906 static int xsk_notifier(struct notifier_block *this,
907 			unsigned long msg, void *ptr)
908 {
909 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
910 	struct net *net = dev_net(dev);
911 	struct sock *sk;
912 
913 	switch (msg) {
914 	case NETDEV_UNREGISTER:
915 		mutex_lock(&net->xdp.lock);
916 		sk_for_each(sk, &net->xdp.list) {
917 			struct xdp_sock *xs = xdp_sk(sk);
918 
919 			mutex_lock(&xs->mutex);
920 			if (xs->dev == dev) {
921 				sk->sk_err = ENETDOWN;
922 				if (!sock_flag(sk, SOCK_DEAD))
923 					sk->sk_error_report(sk);
924 
925 				xsk_unbind_dev(xs);
926 
927 				/* Clear device references in umem. */
928 				xdp_umem_clear_dev(xs->umem);
929 			}
930 			mutex_unlock(&xs->mutex);
931 		}
932 		mutex_unlock(&net->xdp.lock);
933 		break;
934 	}
935 	return NOTIFY_DONE;
936 }
937 
938 static struct proto xsk_proto = {
939 	.name =		"XDP",
940 	.owner =	THIS_MODULE,
941 	.obj_size =	sizeof(struct xdp_sock),
942 };
943 
944 static const struct proto_ops xsk_proto_ops = {
945 	.family		= PF_XDP,
946 	.owner		= THIS_MODULE,
947 	.release	= xsk_release,
948 	.bind		= xsk_bind,
949 	.connect	= sock_no_connect,
950 	.socketpair	= sock_no_socketpair,
951 	.accept		= sock_no_accept,
952 	.getname	= sock_no_getname,
953 	.poll		= xsk_poll,
954 	.ioctl		= sock_no_ioctl,
955 	.listen		= sock_no_listen,
956 	.shutdown	= sock_no_shutdown,
957 	.setsockopt	= xsk_setsockopt,
958 	.getsockopt	= xsk_getsockopt,
959 	.sendmsg	= xsk_sendmsg,
960 	.recvmsg	= sock_no_recvmsg,
961 	.mmap		= xsk_mmap,
962 	.sendpage	= sock_no_sendpage,
963 };
964 
965 static void xsk_destruct(struct sock *sk)
966 {
967 	struct xdp_sock *xs = xdp_sk(sk);
968 
969 	if (!sock_flag(sk, SOCK_DEAD))
970 		return;
971 
972 	xdp_put_umem(xs->umem);
973 
974 	sk_refcnt_debug_dec(sk);
975 }
976 
977 static int xsk_create(struct net *net, struct socket *sock, int protocol,
978 		      int kern)
979 {
980 	struct sock *sk;
981 	struct xdp_sock *xs;
982 
983 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
984 		return -EPERM;
985 	if (sock->type != SOCK_RAW)
986 		return -ESOCKTNOSUPPORT;
987 
988 	if (protocol)
989 		return -EPROTONOSUPPORT;
990 
991 	sock->state = SS_UNCONNECTED;
992 
993 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
994 	if (!sk)
995 		return -ENOBUFS;
996 
997 	sock->ops = &xsk_proto_ops;
998 
999 	sock_init_data(sock, sk);
1000 
1001 	sk->sk_family = PF_XDP;
1002 
1003 	sk->sk_destruct = xsk_destruct;
1004 	sk_refcnt_debug_inc(sk);
1005 
1006 	sock_set_flag(sk, SOCK_RCU_FREE);
1007 
1008 	xs = xdp_sk(sk);
1009 	xs->state = XSK_READY;
1010 	mutex_init(&xs->mutex);
1011 	spin_lock_init(&xs->rx_lock);
1012 	spin_lock_init(&xs->tx_completion_lock);
1013 
1014 	INIT_LIST_HEAD(&xs->map_list);
1015 	spin_lock_init(&xs->map_list_lock);
1016 
1017 	mutex_lock(&net->xdp.lock);
1018 	sk_add_node_rcu(sk, &net->xdp.list);
1019 	mutex_unlock(&net->xdp.lock);
1020 
1021 	local_bh_disable();
1022 	sock_prot_inuse_add(net, &xsk_proto, 1);
1023 	local_bh_enable();
1024 
1025 	return 0;
1026 }
1027 
1028 static const struct net_proto_family xsk_family_ops = {
1029 	.family = PF_XDP,
1030 	.create = xsk_create,
1031 	.owner	= THIS_MODULE,
1032 };
1033 
1034 static struct notifier_block xsk_netdev_notifier = {
1035 	.notifier_call	= xsk_notifier,
1036 };
1037 
1038 static int __net_init xsk_net_init(struct net *net)
1039 {
1040 	mutex_init(&net->xdp.lock);
1041 	INIT_HLIST_HEAD(&net->xdp.list);
1042 	return 0;
1043 }
1044 
1045 static void __net_exit xsk_net_exit(struct net *net)
1046 {
1047 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1048 }
1049 
1050 static struct pernet_operations xsk_net_ops = {
1051 	.init = xsk_net_init,
1052 	.exit = xsk_net_exit,
1053 };
1054 
1055 static int __init xsk_init(void)
1056 {
1057 	int err;
1058 
1059 	err = proto_register(&xsk_proto, 0 /* no slab */);
1060 	if (err)
1061 		goto out;
1062 
1063 	err = sock_register(&xsk_family_ops);
1064 	if (err)
1065 		goto out_proto;
1066 
1067 	err = register_pernet_subsys(&xsk_net_ops);
1068 	if (err)
1069 		goto out_sk;
1070 
1071 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1072 	if (err)
1073 		goto out_pernet;
1074 
1075 	return 0;
1076 
1077 out_pernet:
1078 	unregister_pernet_subsys(&xsk_net_ops);
1079 out_sk:
1080 	sock_unregister(PF_XDP);
1081 out_proto:
1082 	proto_unregister(&xsk_proto);
1083 out:
1084 	return err;
1085 }
1086 
1087 fs_initcall(xsk_init);
1088