xref: /linux/net/xdp/xsk.c (revision 8d72997dab65b1e9e3220302e26eaecd9b99c02f)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <linux/uio.h>
26 #include <linux/vmalloc.h>
27 
28 #include <net/netdev_queues.h>
29 #include <net/xdp_sock_drv.h>
30 #include <net/busy_poll.h>
31 #include <net/netdev_lock.h>
32 #include <net/netdev_rx_queue.h>
33 #include <net/xdp.h>
34 
35 #include "../core/dev.h"
36 
37 #include "xsk_queue.h"
38 #include "xdp_umem.h"
39 #include "xsk.h"
40 
#define TX_BATCH_SIZE 32
/* Value of xs->tx_budget_spent at which xsk_tx_peek_desc() starts skipping
 * a socket until all budgets are reset.
 */
#define MAX_PER_SOCKET_BUDGET 32

/* Completion addresses of all descriptors that make up one skb on the
 * skb based Tx path. Objects are allocated from xsk_tx_generic_cache and
 * hung off skb_shinfo(skb)->destructor_arg.
 */
struct xsk_addrs {
	u32 num_descs;	/* number of descriptors accounted to the skb */
	u64 addrs[MAX_SKB_FRAGS + 1];	/* one address per descriptor */
};

/* Slab cache backing struct xsk_addrs allocations */
static struct kmem_cache *xsk_tx_generic_cache;
50 
51 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
52 {
53 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
54 		return;
55 
56 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
57 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
58 }
59 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
60 
61 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
62 {
63 	struct xdp_sock *xs;
64 
65 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
66 		return;
67 
68 	rcu_read_lock();
69 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
70 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
71 	}
72 	rcu_read_unlock();
73 
74 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
75 }
76 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
77 
78 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
79 {
80 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
81 		return;
82 
83 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
84 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
85 }
86 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
87 
88 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
89 {
90 	struct xdp_sock *xs;
91 
92 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
93 		return;
94 
95 	rcu_read_lock();
96 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
97 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
98 	}
99 	rcu_read_unlock();
100 
101 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
102 }
103 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
104 
105 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
106 {
107 	return pool->uses_need_wakeup;
108 }
109 EXPORT_SYMBOL(xsk_uses_need_wakeup);
110 
111 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
112 					    u16 queue_id)
113 {
114 	if (queue_id < dev->real_num_rx_queues)
115 		return dev->_rx[queue_id].pool;
116 	if (queue_id < dev->real_num_tx_queues)
117 		return dev->_tx[queue_id].pool;
118 
119 	return NULL;
120 }
121 EXPORT_SYMBOL(xsk_get_pool_from_qid);
122 
123 static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
124 {
125 	if (queue_id < dev->num_rx_queues)
126 		dev->_rx[queue_id].pool = NULL;
127 	if (queue_id < dev->num_tx_queues)
128 		dev->_tx[queue_id].pool = NULL;
129 }
130 
131 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
132 {
133 	struct netdev_rx_queue *hw_rxq;
134 
135 	if (!netif_rxq_is_leased(dev, queue_id))
136 		return __xsk_clear_pool_at_qid(dev, queue_id);
137 	WARN_ON_ONCE(!netif_is_queue_leasee(dev));
138 
139 	hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease;
140 
141 	netdev_lock(hw_rxq->dev);
142 	queue_id = get_netdev_rx_queue_index(hw_rxq);
143 	__xsk_clear_pool_at_qid(hw_rxq->dev, queue_id);
144 	netdev_unlock(hw_rxq->dev);
145 }
146 
147 static int __xsk_reg_pool_at_qid(struct net_device *dev,
148 				 struct xsk_buff_pool *pool, u16 queue_id)
149 {
150 	if (xsk_get_pool_from_qid(dev, queue_id))
151 		return -EBUSY;
152 
153 	if (queue_id < dev->real_num_rx_queues)
154 		dev->_rx[queue_id].pool = pool;
155 	if (queue_id < dev->real_num_tx_queues)
156 		dev->_tx[queue_id].pool = pool;
157 
158 	return 0;
159 }
160 
161 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
162  * not know if the device has more tx queues than rx, or the opposite.
163  * This might also change during run time.
164  */
165 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
166 			u16 queue_id)
167 {
168 	struct netdev_rx_queue *hw_rxq;
169 	int ret;
170 
171 	if (queue_id >= max(dev->real_num_rx_queues,
172 			    dev->real_num_tx_queues))
173 		return -EINVAL;
174 
175 	if (queue_id >= dev->real_num_rx_queues ||
176 	    !netif_rxq_is_leased(dev, queue_id))
177 		return __xsk_reg_pool_at_qid(dev, pool, queue_id);
178 	if (!netif_is_queue_leasee(dev))
179 		return -EBUSY;
180 
181 	hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease;
182 
183 	netdev_lock(hw_rxq->dev);
184 	queue_id = get_netdev_rx_queue_index(hw_rxq);
185 	ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id);
186 	netdev_unlock(hw_rxq->dev);
187 
188 	return ret;
189 }
190 
191 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
192 			u32 flags)
193 {
194 	u64 addr;
195 	int err;
196 
197 	addr = xp_get_handle(xskb, xskb->pool);
198 	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
199 	if (err) {
200 		xs->rx_queue_full++;
201 		return err;
202 	}
203 
204 	xp_release(xskb);
205 	return 0;
206 }
207 
208 static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb,
209 			      u32 len, u32 flags)
210 {
211 	u64 addr;
212 
213 	addr = xp_get_handle(xskb, xskb->pool);
214 	__xskq_prod_reserve_desc(xs->rx, addr, len, flags);
215 
216 	xp_release(xskb);
217 }
218 
/* Post a zero-copy buffer to the Rx ring of @xs.
 *
 * @xs:  receiving socket
 * @xdp: buffer to deliver, embedded in a struct xdp_buff_xsk
 * @len: length placed in the first descriptor
 *
 * A buffer without frags takes a single descriptor. A buffer with frags
 * needs nr_frags + 1 descriptors; ring space for all of them is verified
 * first so the individual descriptor writes can use the non-failing helper.
 *
 * Returns 0 on success. On error the buffer is freed with xsk_buff_free()
 * and a negative errno is returned (-ENOBUFS when the ring cannot hold a
 * multi-descriptor packet, else the error from __xsk_rcv_zc()).
 */
static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u32 frags = xdp_buff_has_frags(xdp);
	struct xdp_buff_xsk *pos, *tmp;
	struct list_head *xskb_list;
	u32 contd = 0;
	u32 num_desc;
	int err;

	if (likely(!frags)) {
		err = __xsk_rcv_zc(xs, xskb, len, contd);
		if (err)
			goto err;
		return 0;
	}

	/* Every descriptor except the last one is flagged as continued */
	contd = XDP_PKT_CONTD;
	num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1;
	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
		xs->rx_queue_full++;
		err = -ENOBUFS;
		goto err;
	}

	__xsk_rcv_zc_safe(xs, xskb, len, contd);
	xskb_list = &xskb->pool->xskb_list;
	list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
		/* Entries are unlinked as they are posted, so a singular list
		 * means this is the final one.
		 */
		if (list_is_singular(xskb_list))
			contd = 0;
		len = pos->xdp.data_end - pos->xdp.data;
		__xsk_rcv_zc_safe(xs, pos, len, contd);
		list_del_init(&pos->list_node);
	}

	return 0;
err:
	xsk_buff_free(xdp);
	return err;
}
259 
260 static void *xsk_copy_xdp_start(struct xdp_buff *from)
261 {
262 	if (unlikely(xdp_data_meta_unsupported(from)))
263 		return from->data;
264 	else
265 		return from->data_meta;
266 }
267 
/* Copy packet bytes from a possibly fragmented source into one destination
 * area.
 *
 * @to:       destination address
 * @from:     in/out: read position in the current source segment
 * @to_len:   room available at @to
 * @from_len: in/out: bytes left in the current source segment
 * @frag:     in/out: fragment to switch to once the current segment is
 *            consumed; advanced each time it is used
 * @rem:      bytes of the packet still to be copied by the caller
 *
 * Copying stops when either @rem bytes have been copied in this call or the
 * destination is full. The source cursor (@from, @from_len, @frag) is left
 * positioned for the next call.
 *
 * Returns the number of bytes copied.
 */
static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
			u32 *from_len, skb_frag_t **frag, u32 rem)
{
	u32 copied = 0;

	while (1) {
		u32 copy_len = min_t(u32, *from_len, to_len);

		memcpy(to, *from, copy_len);
		copied += copy_len;
		/* Whole packet done: return before touching *frag again */
		if (rem == copied)
			return copied;

		if (*from_len == copy_len) {
			/* Current source segment exhausted, move to next frag */
			*from = skb_frag_address(*frag);
			*from_len = skb_frag_size((*frag)++);
		} else {
			*from += copy_len;
			*from_len -= copy_len;
		}
		/* Destination full */
		if (to_len == copy_len)
			return copied;

		to_len -= copy_len;
		to += copy_len;
	}
}
295 
/* Copy the packet in @xdp into buffers allocated from the socket's pool and
 * post them to the Rx ring.
 *
 * @xs:  receiving socket
 * @xdp: source buffer; not freed here
 * @len: packet length (without metadata)
 *
 * Metadata in front of the packet data, if any, is copied in front of the
 * data of the first destination buffer. A packet without frags that fits in
 * one frame takes the single-descriptor path. Otherwise the packet is split
 * over ceil(len / frame_size) buffers, after checking that the pool can
 * supply them and that the Rx ring has room for all descriptors.
 *
 * Returns 0 on success, -ENOMEM when buffers cannot be allocated (counted in
 * rx_dropped), -ENOBUFS when the ring lacks room for the multi-descriptor
 * case (counted in rx_queue_full), or the error from __xsk_rcv_zc().
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool);
	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
	u32 from_len, meta_len, rem, num_desc;
	struct xdp_buff_xsk *xskb;
	struct xdp_buff *xsk_xdp;
	skb_frag_t *frag;

	from_len = xdp->data_end - copy_from;
	meta_len = xdp->data - copy_from;
	rem = len + meta_len;

	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
		int err;

		xsk_xdp = xsk_buff_alloc(xs->pool);
		if (!xsk_xdp) {
			xs->rx_dropped++;
			return -ENOMEM;
		}
		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		err = __xsk_rcv_zc(xs, xskb, len, 0);
		if (err) {
			xsk_buff_free(xsk_xdp);
			return err;
		}

		return 0;
	}

	/* Round up: number of frames needed to hold len bytes */
	num_desc = (len - 1) / frame_size + 1;

	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
		xs->rx_dropped++;
		return -ENOMEM;
	}
	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
		xs->rx_queue_full++;
		return -ENOBUFS;
	}

	/* frag is only initialised (and only needed) when there are frags */
	if (xdp_buff_has_frags(xdp)) {
		struct skb_shared_info *sinfo;

		sinfo = xdp_get_shared_info_from_buff(xdp);
		frag =  &sinfo->frags[0];
	}

	do {
		u32 to_len = frame_size + meta_len;
		u32 copied;

		/* Result is not checked; availability was verified above */
		xsk_xdp = xsk_buff_alloc(xs->pool);
		copy_to = xsk_xdp->data - meta_len;

		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
		rem -= copied;

		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
		__xsk_rcv_zc_safe(xs, xskb, copied - meta_len,
				  rem ? XDP_PKT_CONTD : 0);
		/* Metadata only precedes the first buffer */
		meta_len = 0;
	} while (rem);

	return 0;
}
364 
365 static bool xsk_tx_writeable(struct xdp_sock *xs)
366 {
367 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
368 		return false;
369 
370 	return true;
371 }
372 
373 static void __xsk_tx_release(struct xdp_sock *xs)
374 {
375 	__xskq_cons_release(xs->tx);
376 	if (xsk_tx_writeable(xs))
377 		xs->sk.sk_write_space(&xs->sk);
378 }
379 
380 static bool xsk_is_bound(struct xdp_sock *xs)
381 {
382 	if (READ_ONCE(xs->state) == XSK_BOUND) {
383 		/* Matches smp_wmb() in bind(). */
384 		smp_rmb();
385 		return true;
386 	}
387 	return false;
388 }
389 
390 static bool xsk_dev_queue_valid(const struct xdp_sock *xs,
391 				const struct xdp_rxq_info *info)
392 {
393 	struct net_device *dev = xs->dev;
394 	u32 queue_index = xs->queue_id;
395 	struct netdev_rx_queue *rxq;
396 
397 	if (info->dev == dev &&
398 	    info->queue_index == queue_index)
399 		return true;
400 
401 	if (queue_index < dev->real_num_rx_queues) {
402 		rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease);
403 		if (!rxq)
404 			return false;
405 
406 		dev = rxq->dev;
407 		queue_index = get_netdev_rx_queue_index(rxq);
408 
409 		return info->dev == dev &&
410 		       info->queue_index == queue_index;
411 	}
412 	return false;
413 }
414 
415 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
416 {
417 	if (!xsk_is_bound(xs))
418 		return -ENXIO;
419 	if (!xsk_dev_queue_valid(xs, xdp->rxq))
420 		return -EINVAL;
421 
422 	if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
423 		xs->rx_dropped++;
424 		return -ENOSPC;
425 	}
426 
427 	return 0;
428 }
429 
430 static void xsk_flush(struct xdp_sock *xs)
431 {
432 	xskq_prod_submit(xs->rx);
433 	__xskq_cons_release(xs->pool->fq);
434 	sock_def_readable(&xs->sk);
435 }
436 
437 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
438 {
439 	u32 len = xdp_get_buff_len(xdp);
440 	int err;
441 
442 	err = xsk_rcv_check(xs, xdp, len);
443 	if (!err) {
444 		spin_lock_bh(&xs->pool->rx_lock);
445 		err = __xsk_rcv(xs, xdp, len);
446 		xsk_flush(xs);
447 		spin_unlock_bh(&xs->pool->rx_lock);
448 	}
449 
450 	return err;
451 }
452 
453 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
454 {
455 	u32 len = xdp_get_buff_len(xdp);
456 	int err;
457 
458 	err = xsk_rcv_check(xs, xdp, len);
459 	if (err)
460 		return err;
461 
462 	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
463 		len = xdp->data_end - xdp->data;
464 		return xsk_rcv_zc(xs, xdp, len);
465 	}
466 
467 	err = __xsk_rcv(xs, xdp, len);
468 	if (!err)
469 		xdp_return_buff(xdp);
470 	return err;
471 }
472 
473 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
474 {
475 	int err;
476 
477 	err = xsk_rcv(xs, xdp);
478 	if (err)
479 		return err;
480 
481 	if (!xs->flush_node.prev) {
482 		struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
483 
484 		list_add(&xs->flush_node, flush_list);
485 	}
486 
487 	return 0;
488 }
489 
490 void __xsk_map_flush(struct list_head *flush_list)
491 {
492 	struct xdp_sock *xs, *tmp;
493 
494 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
495 		xsk_flush(xs);
496 		__list_del_clearprev(&xs->flush_node);
497 	}
498 }
499 
500 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
501 {
502 	xskq_prod_submit_n(pool->cq, nb_entries);
503 }
504 EXPORT_SYMBOL(xsk_tx_completed);
505 
506 void xsk_tx_release(struct xsk_buff_pool *pool)
507 {
508 	struct xdp_sock *xs;
509 
510 	rcu_read_lock();
511 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
512 		__xsk_tx_release(xs);
513 	rcu_read_unlock();
514 }
515 EXPORT_SYMBOL(xsk_tx_release);
516 
/* Fetch the next Tx descriptor from one of the sockets sharing @pool.
 *
 * @pool: buffer pool whose xsk_tx_list is scanned
 * @desc: filled in with the descriptor on success
 *
 * Sockets are visited in list order. A socket that has already spent
 * MAX_PER_SOCKET_BUDGET descriptors is skipped; if no descriptor was found
 * and at least one socket was skipped for that reason, all budgets are
 * reset and the scan is repeated once more from the start.
 *
 * Returns true when @desc holds a descriptor for which an entry in the
 * pool->cq ring has been reserved, false otherwise.
 */
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	bool budget_exhausted = false;
	struct xdp_sock *xs;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
			budget_exhausted = true;
			continue;
		}

		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			/* Peek failed although the ring has entries: release
			 * what was consumed and try the next socket.
			 */
			if (xskq_has_descs(xs->tx))
				xskq_cons_release(xs->tx);
			continue;
		}

		xs->tx_budget_spent++;

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

	if (budget_exhausted) {
		/* Give every socket a fresh budget and rescan */
		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
			xs->tx_budget_spent = 0;

		budget_exhausted = false;
		goto again;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);
564 
565 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
566 {
567 	struct xdp_desc *descs = pool->tx_descs;
568 	u32 nb_pkts = 0;
569 
570 	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
571 		nb_pkts++;
572 
573 	xsk_tx_release(pool);
574 	return nb_pkts;
575 }
576 
/* Fetch up to @nb_pkts Tx descriptors into pool->tx_descs in one go.
 *
 * @pool:    buffer pool to fetch descriptors for
 * @nb_pkts: maximum number of descriptors wanted
 *
 * The batched path is only taken when exactly one socket is on the pool's
 * xsk_tx_list; otherwise the work is handed to
 * xsk_tx_peek_release_fallback(). The count is clamped first to what the Tx
 * ring offers and then to the free space in the pool->cq ring.
 *
 * Returns the number of descriptors placed in pool->tx_descs.
 */
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, nb_pkts);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
	if (!nb_pkts)
		goto out;

	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* Release the Tx entries, record the addresses in the completion
	 * ring and notify the socket that there is write space.
	 */
	__xskq_cons_release(xs->tx);
	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
621 
622 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
623 {
624 	struct net_device *dev = xs->dev;
625 
626 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
627 }
628 
629 static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
630 {
631 	int ret;
632 
633 	spin_lock(&pool->cq->cq_cached_prod_lock);
634 	ret = xskq_prod_reserve(pool->cq);
635 	spin_unlock(&pool->cq->cq_cached_prod_lock);
636 
637 	return ret;
638 }
639 
640 static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
641 {
642 	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
643 }
644 
645 static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
646 {
647 	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
648 }
649 
650 static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
651 {
652 	struct xsk_addrs *xsk_addr;
653 
654 	xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
655 	if (unlikely(!xsk_addr))
656 		return NULL;
657 
658 	xsk_addr->addrs[0] = addr;
659 	skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
660 	return xsk_addr;
661 }
662 
663 static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
664 {
665 	struct xsk_addrs *xsk_addr;
666 
667 	if (!xsk_skb_destructor_is_addr(skb))
668 		return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
669 
670 	xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
671 	if (likely(xsk_addr))
672 		xsk_addr->num_descs = 1;
673 	return xsk_addr;
674 }
675 
676 static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
677 {
678 	if (IS_ENABLED(CONFIG_64BIT)) {
679 		skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
680 		return 0;
681 	}
682 
683 	if (unlikely(!__xsk_addrs_alloc(skb, addr)))
684 		return -ENOMEM;
685 	return 0;
686 }
687 
688 static void xsk_inc_num_desc(struct sk_buff *skb)
689 {
690 	struct xsk_addrs *xsk_addr;
691 
692 	if (!xsk_skb_destructor_is_addr(skb)) {
693 		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
694 		xsk_addr->num_descs++;
695 	}
696 }
697 
698 static u32 xsk_get_num_desc(struct sk_buff *skb)
699 {
700 	struct xsk_addrs *xsk_addr;
701 
702 	if (xsk_skb_destructor_is_addr(skb))
703 		return 1;
704 
705 	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
706 
707 	return xsk_addr->num_descs;
708 }
709 
/* Write the descriptor address(es) of @skb into the pool->cq ring and
 * submit them.
 *
 * @pool: buffer pool owning the ring
 * @skb:  skb whose destructor_arg holds either a tagged address or a
 *        struct xsk_addrs
 *
 * Runs under pool->cq_prod_lock with interrupts saved/disabled. When a
 * struct xsk_addrs is attached it is freed here after its addresses have
 * been written.
 */
static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
				      struct sk_buff *skb)
{
	u32 num_descs = xsk_get_num_desc(skb);
	struct xsk_addrs *xsk_addr;
	u32 descs_processed = 0;
	unsigned long flags;
	u32 idx, i;

	spin_lock_irqsave(&pool->cq_prod_lock, flags);
	idx = xskq_get_prod(pool->cq);

	if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;

		/* One completion entry per descriptor of the skb */
		for (i = 0; i < num_descs; i++) {
			xskq_prod_write_addr(pool->cq, idx + descs_processed,
					     xsk_addr->addrs[i]);
			descs_processed++;
		}
		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
	} else {
		/* Single descriptor: address is encoded in destructor_arg */
		xskq_prod_write_addr(pool->cq, idx,
				     xsk_skb_destructor_get_addr(skb));
		descs_processed++;
	}
	xskq_prod_submit_n(pool->cq, descs_processed);
	spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
}
739 
740 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
741 {
742 	spin_lock(&pool->cq->cq_cached_prod_lock);
743 	xskq_prod_cancel_n(pool->cq, n);
744 	spin_unlock(&pool->cq->cq_cached_prod_lock);
745 }
746 
747 INDIRECT_CALLABLE_SCOPE
748 void xsk_destruct_skb(struct sk_buff *skb)
749 {
750 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
751 
752 	if (compl->tx_timestamp) {
753 		/* sw completion timestamp, not a real one */
754 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
755 	}
756 
757 	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
758 	sock_wfree(skb);
759 }
760 
761 static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
762 			     u64 addr)
763 {
764 	int err;
765 
766 	err = xsk_skb_destructor_set_addr(skb, addr);
767 	if (unlikely(err))
768 		return err;
769 
770 	skb->dev = xs->dev;
771 	skb->priority = READ_ONCE(xs->sk.sk_priority);
772 	skb->mark = READ_ONCE(xs->sk.sk_mark);
773 	skb->destructor = xsk_destruct_skb;
774 	return 0;
775 }
776 
777 static void xsk_consume_skb(struct sk_buff *skb)
778 {
779 	struct xdp_sock *xs = xdp_sk(skb->sk);
780 	u32 num_descs = xsk_get_num_desc(skb);
781 	struct xsk_addrs *xsk_addr;
782 
783 	if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
784 		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
785 		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
786 	}
787 
788 	skb->destructor = sock_wfree;
789 	xsk_cq_cancel_locked(xs->pool, num_descs);
790 	/* Free skb without triggering the perf drop trace */
791 	consume_skb(skb);
792 	xs->skb = NULL;
793 }
794 
795 static void xsk_drop_skb(struct sk_buff *skb)
796 {
797 	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
798 	xsk_consume_skb(skb);
799 }
800 
/* Apply the Tx metadata that precedes a packet buffer to @skb.
 *
 * @skb:    skb being built
 * @buffer: start of the packet data; the metadata sits
 *          pool->tx_metadata_len bytes in front of it
 * @desc:   descriptor being transmitted (its len bounds the checksum area)
 * @pool:   buffer pool providing tx_metadata_len and tx_sw_csum
 * @hr:     headroom reserved in @skb, added to csum_start
 *
 * Handles the checksum request (XDP_TXMD_FLAGS_CHECKSUM) and launch time
 * (XDP_TXMD_FLAGS_LAUNCH_TIME) flags and records completion information in
 * skb_shinfo(skb)->xsk_meta.
 *
 * Returns 0 on success, -EINVAL when the pool has no metadata area, the
 * metadata is invalid or the checksum location lies outside the packet, or
 * the error from skb_checksum_help().
 */
static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
			    struct xdp_desc *desc, struct xsk_buff_pool *pool,
			    u32 hr)
{
	struct xsk_tx_metadata *meta = NULL;
	u16 csum_start, csum_offset;

	if (unlikely(pool->tx_metadata_len == 0))
		return -EINVAL;

	meta = buffer - pool->tx_metadata_len;
	if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
		return -EINVAL;

	if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
		csum_start = READ_ONCE(meta->request.csum_start);
		csum_offset = READ_ONCE(meta->request.csum_offset);

		/* The 16-bit checksum field must lie within the packet */
		if (unlikely(csum_start + csum_offset +
			     sizeof(__sum16) > desc->len))
			return -EINVAL;

		skb->csum_start = hr + csum_start;
		skb->csum_offset = csum_offset;
		skb->ip_summed = CHECKSUM_PARTIAL;

		/* Compute the checksum in software when the pool asks for it */
		if (unlikely(pool->tx_sw_csum)) {
			int err;

			err = skb_checksum_help(skb);
			if (err)
				return err;
		}
	}

	if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
		skb->skb_mstamp_ns = meta->request.launch_time;
	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);

	return 0;
}
842 
/* Build or extend an skb whose frags reference the umem pages of @desc
 * directly instead of copying the data.
 *
 * @xs:   sending socket; xs->skb, when set, is the skb being assembled for
 *        a multi-descriptor packet
 * @desc: Tx descriptor to add
 *
 * For the first descriptor a new skb with only headroom is allocated and
 * Tx metadata is applied if requested. For a follow-up descriptor the
 * descriptor address is recorded in the skb's struct xsk_addrs. The
 * descriptor's data is then attached page by page as frags.
 *
 * Returns the skb, or an ERR_PTR: the sock_alloc_send_skb() or metadata
 * error, -ENOMEM when struct xsk_addrs cannot be allocated, or -EOVERFLOW
 * when MAX_SKB_FRAGS would be exceeded. A newly allocated skb is freed here
 * on error; an skb already stored in xs->skb is left for the caller.
 */
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb = xs->skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	addr = desc->addr;
	buffer = xsk_buff_raw_get_data(pool, addr);

	if (!skb) {
		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
		if (desc->options & XDP_TX_METADATA) {
			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
			if (unlikely(err)) {
				kfree_skb(skb);
				return ERR_PTR(err);
			}
		}
	} else {
		struct xsk_addrs *xsk_addr;

		xsk_addr = xsk_addrs_alloc(skb);
		if (!xsk_addr)
			return ERR_PTR(-ENOMEM);

		/* in case of -EOVERFLOW that could happen below,
		 * xsk_consume_skb() will release this node as whole skb
		 * would be dropped, which implies freeing all list elements
		 */
		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
	}

	len = desc->len;
	/* Amount charged to truesize and sk_wmem_alloc for this descriptor */
	ts = pool->unaligned ? len : pool->chunk_size;

	offset = offset_in_page(buffer);
	/* From here on addr is the byte offset of the data within pool->addrs */
	addr = buffer - pool->addrs;

	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
		if (unlikely(i >= MAX_SKB_FRAGS)) {
			/* Only free an skb that was allocated in this call */
			if (!xs->skb)
				kfree_skb(skb);
			return ERR_PTR(-EOVERFLOW);
		}

		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		/* A frag never crosses a page boundary */
		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}
918 
/* Build or extend the skb for Tx descriptor @desc on the skb based path.
 *
 * @xs:   sending socket; xs->skb, when set, is a partially assembled
 *        multi-descriptor packet that @desc is appended to
 * @desc: Tx descriptor to add
 *
 * Devices with IFF_TX_SKB_NO_LINEAR get an skb referencing the umem pages
 * (xsk_build_skb_zerocopy()). Otherwise the data is copied: the first
 * descriptor into the linear area of a new skb, each further descriptor
 * into a freshly allocated page added as a frag.
 *
 * Returns the skb or an ERR_PTR. On -EOVERFLOW the packet is dropped and
 * the Tx descriptor is released here; for every other error one completion
 * ring reservation is cancelled and the descriptor stays in the Tx ring so
 * the application can retry.
 */
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct net_device *dev = xs->dev;
	struct sk_buff *skb = xs->skb;
	int err;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			/* The helper already freed a newly allocated skb */
			skb = NULL;
			goto free_err;
		}
	} else {
		u32 hr, tr, len;
		void *buffer;

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		len = desc->len;

		if (!skb) {
			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
			tr = dev->needed_tailroom;
			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
			if (unlikely(!skb))
				goto free_err;

			skb_reserve(skb, hr);
			skb_put(skb, len);

			err = skb_store_bits(skb, 0, buffer, len);
			if (unlikely(err))
				goto free_err;

			if (desc->options & XDP_TX_METADATA) {
				err = xsk_skb_metadata(skb, buffer, desc,
						       xs->pool, hr);
				if (unlikely(err))
					goto free_err;
			}
		} else {
			int nr_frags = skb_shinfo(skb)->nr_frags;
			struct xsk_addrs *xsk_addr;
			struct page *page;
			u8 *vaddr;

			xsk_addr = xsk_addrs_alloc(skb);
			if (!xsk_addr) {
				err = -ENOMEM;
				goto free_err;
			}

			/* Last frag slot reached but more descriptors follow */
			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
				err = -EOVERFLOW;
				goto free_err;
			}

			page = alloc_page(xs->sk.sk_allocation);
			if (unlikely(!page)) {
				err = -EAGAIN;
				goto free_err;
			}

			vaddr = kmap_local_page(page);
			memcpy(vaddr, buffer, len);
			kunmap_local(vaddr);

			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);

			xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
		}
	}

	/* First descriptor of a packet: set up destructor, dev, mark, ... */
	if (!xs->skb) {
		err = xsk_skb_init_misc(skb, xs, desc->addr);
		if (unlikely(err))
			goto free_err;
	}
	xsk_inc_num_desc(skb);

	return skb;

free_err:
	/* Only an skb allocated in this call is freed here; one stored in
	 * xs->skb is either dropped below or kept for a retry.
	 */
	if (skb && !xs->skb)
		kfree_skb(skb);

	if (err == -EOVERFLOW) {
		if (xs->skb) {
			/* Drop the packet */
			xsk_inc_num_desc(xs->skb);
			xsk_drop_skb(xs->skb);
		} else {
			xsk_cq_cancel_locked(xs->pool, 1);
			xs->tx->invalid_descs++;
		}
		xskq_cons_release(xs->tx);
	} else {
		/* Let application retry */
		xsk_cq_cancel_locked(xs->pool, 1);
	}

	return ERR_PTR(err);
}
1024 
/* Transmit pending Tx descriptors of @sk through the skb based path.
 *
 * Runs under xs->mutex. Up to xs->max_tx_budget descriptors are processed
 * per call. For each descriptor a completion ring entry is reserved, the
 * skb is built or extended, and once the last descriptor of a packet has
 * been added the skb is handed to __dev_direct_xmit().
 *
 * Returns 0 when the loop finished normally, -ENXIO when the socket is no
 * longer bound, -EAGAIN when the budget or the completion ring is exhausted
 * or the device returned NETDEV_TX_BUSY, -EBUSY when the skb was dropped
 * (NET_XMIT_DROP), or an error from xsk_build_skb().
 */
static int __xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	u32 max_batch;
	int err = 0;

	mutex_lock(&xs->mutex);

	/* Since we dropped the RCU read lock, the socket state might have changed. */
	if (unlikely(!xsk_is_bound(xs))) {
		err = -ENXIO;
		goto out;
	}

	/* No Tx queue with this id: nothing to send, not reported as error */
	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	max_batch = READ_ONCE(xs->max_tx_budget);
	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		err = xsk_cq_reserve_locked(xs->pool);
		if (err) {
			err = -EAGAIN;
			goto out;
		}

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			/* -EOVERFLOW: packet was dropped and the descriptor
			 * released by xsk_build_skb(); carry on with the next.
			 */
			if (err != -EOVERFLOW)
				goto out;
			err = 0;
			continue;
		}

		xskq_cons_release(xs->tx);

		/* More descriptors of this packet follow: keep collecting */
		if (xp_mb_desc(&desc)) {
			xs->skb = skb;
			continue;
		}

		err = __dev_direct_xmit(skb, xs->queue_id);
		if  (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
			xsk_consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			xs->skb = NULL;
			goto out;
		}

		sent_frame = true;
		xs->skb = NULL;
	}

	/* The peek stopped although the ring still has entries: drop a
	 * partially built packet, if any, and release the entry.
	 */
	if (xskq_has_descs(xs->tx)) {
		if (xs->skb)
			xsk_drop_skb(xs->skb);
		xskq_cons_release(xs->tx);
	}

out:
	if (sent_frame)
		__xsk_tx_release(xs);

	mutex_unlock(&xs->mutex);
	return err;
}
1113 
/* Wrapper around __xsk_generic_xmit() for callers that hold the RCU read
 * lock: the lock is released for the duration of the call and taken again
 * before returning.
 */
static int xsk_generic_xmit(struct sock *sk)
{
	int err;

	/* Drop the RCU lock since the SKB path might sleep. */
	rcu_read_unlock();

	err = __xsk_generic_xmit(sk);

	/* Reacquire RCU lock before going into common code. */
	rcu_read_lock();

	return err;
}
1126 
1127 static bool xsk_no_wakeup(struct sock *sk)
1128 {
1129 #ifdef CONFIG_NET_RX_BUSY_POLL
1130 	/* Prefer busy-polling, skip the wakeup. */
1131 	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
1132 		napi_id_valid(READ_ONCE(sk->sk_napi_id));
1133 #else
1134 	return false;
1135 #endif
1136 }
1137 
1138 static int xsk_check_common(struct xdp_sock *xs)
1139 {
1140 	if (unlikely(!xsk_is_bound(xs)))
1141 		return -ENXIO;
1142 	if (unlikely(!(xs->dev->flags & IFF_UP)))
1143 		return -ENETDOWN;
1144 
1145 	return 0;
1146 }
1147 
1148 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
1149 {
1150 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
1151 	struct sock *sk = sock->sk;
1152 	struct xdp_sock *xs = xdp_sk(sk);
1153 	struct xsk_buff_pool *pool;
1154 	int err;
1155 
1156 	err = xsk_check_common(xs);
1157 	if (err)
1158 		return err;
1159 	if (unlikely(need_wait))
1160 		return -EOPNOTSUPP;
1161 	if (unlikely(!xs->tx))
1162 		return -ENOBUFS;
1163 
1164 	if (sk_can_busy_loop(sk))
1165 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
1166 
1167 	if (xs->zc && xsk_no_wakeup(sk))
1168 		return 0;
1169 
1170 	pool = xs->pool;
1171 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
1172 		if (xs->zc)
1173 			return xsk_wakeup(xs, XDP_WAKEUP_TX);
1174 		return xsk_generic_xmit(sk);
1175 	}
1176 	return 0;
1177 }
1178 
1179 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
1180 {
1181 	int ret;
1182 
1183 	rcu_read_lock();
1184 	ret = __xsk_sendmsg(sock, m, total_len);
1185 	rcu_read_unlock();
1186 
1187 	return ret;
1188 }
1189 
1190 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
1191 {
1192 	bool need_wait = !(flags & MSG_DONTWAIT);
1193 	struct sock *sk = sock->sk;
1194 	struct xdp_sock *xs = xdp_sk(sk);
1195 	int err;
1196 
1197 	err = xsk_check_common(xs);
1198 	if (err)
1199 		return err;
1200 	if (unlikely(!xs->rx))
1201 		return -ENOBUFS;
1202 	if (unlikely(need_wait))
1203 		return -EOPNOTSUPP;
1204 
1205 	if (sk_can_busy_loop(sk))
1206 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
1207 
1208 	if (xsk_no_wakeup(sk))
1209 		return 0;
1210 
1211 	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
1212 		return xsk_wakeup(xs, XDP_WAKEUP_RX);
1213 	return 0;
1214 }
1215 
1216 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
1217 {
1218 	int ret;
1219 
1220 	rcu_read_lock();
1221 	ret = __xsk_recvmsg(sock, m, len, flags);
1222 	rcu_read_unlock();
1223 
1224 	return ret;
1225 }
1226 
1227 static __poll_t xsk_poll(struct file *file, struct socket *sock,
1228 			     struct poll_table_struct *wait)
1229 {
1230 	__poll_t mask = 0;
1231 	struct sock *sk = sock->sk;
1232 	struct xdp_sock *xs = xdp_sk(sk);
1233 	struct xsk_buff_pool *pool;
1234 
1235 	sock_poll_wait(file, sock, wait);
1236 
1237 	rcu_read_lock();
1238 	if (xsk_check_common(xs))
1239 		goto out;
1240 
1241 	pool = xs->pool;
1242 
1243 	if (pool->cached_need_wakeup) {
1244 		if (xs->zc)
1245 			xsk_wakeup(xs, pool->cached_need_wakeup);
1246 		else if (xs->tx)
1247 			/* Poll needs to drive Tx also in copy mode */
1248 			xsk_generic_xmit(sk);
1249 	}
1250 
1251 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
1252 		mask |= EPOLLIN | EPOLLRDNORM;
1253 	if (xs->tx && xsk_tx_writeable(xs))
1254 		mask |= EPOLLOUT | EPOLLWRNORM;
1255 out:
1256 	rcu_read_unlock();
1257 	return mask;
1258 }
1259 
1260 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
1261 			  bool umem_queue)
1262 {
1263 	struct xsk_queue *q;
1264 
1265 	if (entries == 0 || *queue || !is_power_of_2(entries))
1266 		return -EINVAL;
1267 
1268 	q = xskq_create(entries, umem_queue);
1269 	if (!q)
1270 		return -ENOMEM;
1271 
1272 	/* Make sure queue is ready before it can be seen by others */
1273 	smp_wmb();
1274 	WRITE_ONCE(*queue, q);
1275 	return 0;
1276 }
1277 
1278 static void xsk_unbind_dev(struct xdp_sock *xs)
1279 {
1280 	struct net_device *dev = xs->dev;
1281 
1282 	if (xs->state != XSK_BOUND)
1283 		return;
1284 	WRITE_ONCE(xs->state, XSK_UNBOUND);
1285 
1286 	/* Wait for driver to stop using the xdp socket. */
1287 	xp_del_xsk(xs->pool, xs);
1288 	synchronize_net();
1289 	dev_put(dev);
1290 }
1291 
1292 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
1293 					      struct xdp_sock __rcu ***map_entry)
1294 {
1295 	struct xsk_map *map = NULL;
1296 	struct xsk_map_node *node;
1297 
1298 	*map_entry = NULL;
1299 
1300 	spin_lock_bh(&xs->map_list_lock);
1301 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
1302 					node);
1303 	if (node) {
1304 		bpf_map_inc(&node->map->map);
1305 		map = node->map;
1306 		*map_entry = node->map_entry;
1307 	}
1308 	spin_unlock_bh(&xs->map_list_lock);
1309 	return map;
1310 }
1311 
1312 static void xsk_delete_from_maps(struct xdp_sock *xs)
1313 {
1314 	/* This function removes the current XDP socket from all the
1315 	 * maps it resides in. We need to take extra care here, due to
1316 	 * the two locks involved. Each map has a lock synchronizing
1317 	 * updates to the entries, and each socket has a lock that
1318 	 * synchronizes access to the list of maps (map_list). For
1319 	 * deadlock avoidance the locks need to be taken in the order
1320 	 * "map lock"->"socket map list lock". We start off by
1321 	 * accessing the socket map list, and take a reference to the
1322 	 * map to guarantee existence between the
1323 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
1324 	 * calls. Then we ask the map to remove the socket, which
1325 	 * tries to remove the socket from the map. Note that there
1326 	 * might be updates to the map between
1327 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
1328 	 */
1329 	struct xdp_sock __rcu **map_entry = NULL;
1330 	struct xsk_map *map;
1331 
1332 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
1333 		xsk_map_try_sock_delete(map, xs, map_entry);
1334 		bpf_map_put(&map->map);
1335 	}
1336 }
1337 
1338 static int xsk_release(struct socket *sock)
1339 {
1340 	struct sock *sk = sock->sk;
1341 	struct xdp_sock *xs = xdp_sk(sk);
1342 	struct net *net;
1343 
1344 	if (!sk)
1345 		return 0;
1346 
1347 	net = sock_net(sk);
1348 
1349 	if (xs->skb)
1350 		xsk_drop_skb(xs->skb);
1351 
1352 	mutex_lock(&net->xdp.lock);
1353 	sk_del_node_init_rcu(sk);
1354 	mutex_unlock(&net->xdp.lock);
1355 
1356 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1357 
1358 	xsk_delete_from_maps(xs);
1359 	mutex_lock(&xs->mutex);
1360 	xsk_unbind_dev(xs);
1361 	mutex_unlock(&xs->mutex);
1362 
1363 	xskq_destroy(xs->rx);
1364 	xskq_destroy(xs->tx);
1365 	xskq_destroy(xs->fq_tmp);
1366 	xskq_destroy(xs->cq_tmp);
1367 
1368 	sock_orphan(sk);
1369 	sock->sk = NULL;
1370 
1371 	sock_put(sk);
1372 
1373 	return 0;
1374 }
1375 
1376 static struct socket *xsk_lookup_xsk_from_fd(int fd)
1377 {
1378 	struct socket *sock;
1379 	int err;
1380 
1381 	sock = sockfd_lookup(fd, &err);
1382 	if (!sock)
1383 		return ERR_PTR(-ENOTSOCK);
1384 
1385 	if (sock->sk->sk_family != PF_XDP) {
1386 		sockfd_put(sock);
1387 		return ERR_PTR(-ENOPROTOOPT);
1388 	}
1389 
1390 	return sock;
1391 }
1392 
1393 static bool xsk_validate_queues(struct xdp_sock *xs)
1394 {
1395 	return xs->fq_tmp && xs->cq_tmp;
1396 }
1397 
1398 static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
1399 {
1400 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1401 	struct sock *sk = sock->sk;
1402 	struct xdp_sock *xs = xdp_sk(sk);
1403 	struct net_device *dev;
1404 	int bound_dev_if;
1405 	u32 flags, qid;
1406 	int err = 0;
1407 
1408 	if (addr_len < sizeof(struct sockaddr_xdp))
1409 		return -EINVAL;
1410 	if (sxdp->sxdp_family != AF_XDP)
1411 		return -EINVAL;
1412 
1413 	flags = sxdp->sxdp_flags;
1414 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
1415 		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1416 		return -EINVAL;
1417 
1418 	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1419 	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1420 		return -EINVAL;
1421 
1422 	rtnl_lock();
1423 	mutex_lock(&xs->mutex);
1424 	if (xs->state != XSK_READY) {
1425 		err = -EBUSY;
1426 		goto out_release;
1427 	}
1428 
1429 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1430 	if (!dev) {
1431 		err = -ENODEV;
1432 		goto out_release;
1433 	}
1434 
1435 	netdev_lock_ops(dev);
1436 
1437 	if (!xs->rx && !xs->tx) {
1438 		err = -EINVAL;
1439 		goto out_unlock;
1440 	}
1441 
1442 	qid = sxdp->sxdp_queue_id;
1443 
1444 	if (flags & XDP_SHARED_UMEM) {
1445 		struct xdp_sock *umem_xs;
1446 		struct socket *sock;
1447 
1448 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
1449 		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1450 			/* Cannot specify flags for shared sockets. */
1451 			err = -EINVAL;
1452 			goto out_unlock;
1453 		}
1454 
1455 		if (xs->umem) {
1456 			/* We have already our own. */
1457 			err = -EINVAL;
1458 			goto out_unlock;
1459 		}
1460 
1461 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1462 		if (IS_ERR(sock)) {
1463 			err = PTR_ERR(sock);
1464 			goto out_unlock;
1465 		}
1466 
1467 		umem_xs = xdp_sk(sock->sk);
1468 		if (!xsk_is_bound(umem_xs)) {
1469 			err = -EBADF;
1470 			sockfd_put(sock);
1471 			goto out_unlock;
1472 		}
1473 
1474 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1475 			/* One fill and completion ring required for each queue id. */
1476 			if (!xsk_validate_queues(xs)) {
1477 				err = -EINVAL;
1478 				sockfd_put(sock);
1479 				goto out_unlock;
1480 			}
1481 
1482 			/* Share the umem with another socket on another qid
1483 			 * and/or device.
1484 			 */
1485 			xs->pool = xp_create_and_assign_umem(xs,
1486 							     umem_xs->umem);
1487 			if (!xs->pool) {
1488 				err = -ENOMEM;
1489 				sockfd_put(sock);
1490 				goto out_unlock;
1491 			}
1492 
1493 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
1494 						   qid);
1495 			if (err) {
1496 				xp_destroy(xs->pool);
1497 				xs->pool = NULL;
1498 				sockfd_put(sock);
1499 				goto out_unlock;
1500 			}
1501 		} else {
1502 			/* Share the buffer pool with the other socket. */
1503 			if (xs->fq_tmp || xs->cq_tmp) {
1504 				/* Do not allow setting your own fq or cq. */
1505 				err = -EINVAL;
1506 				sockfd_put(sock);
1507 				goto out_unlock;
1508 			}
1509 
1510 			xp_get_pool(umem_xs->pool);
1511 			xs->pool = umem_xs->pool;
1512 
1513 			/* If underlying shared umem was created without Tx
1514 			 * ring, allocate Tx descs array that Tx batching API
1515 			 * utilizes
1516 			 */
1517 			if (xs->tx && !xs->pool->tx_descs) {
1518 				err = xp_alloc_tx_descs(xs->pool, xs);
1519 				if (err) {
1520 					xp_put_pool(xs->pool);
1521 					xs->pool = NULL;
1522 					sockfd_put(sock);
1523 					goto out_unlock;
1524 				}
1525 			}
1526 		}
1527 
1528 		xdp_get_umem(umem_xs->umem);
1529 		WRITE_ONCE(xs->umem, umem_xs->umem);
1530 		sockfd_put(sock);
1531 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1532 		err = -EINVAL;
1533 		goto out_unlock;
1534 	} else {
1535 		/* This xsk has its own umem. */
1536 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1537 		if (!xs->pool) {
1538 			err = -ENOMEM;
1539 			goto out_unlock;
1540 		}
1541 
1542 		err = xp_assign_dev(xs->pool, dev, qid, flags);
1543 		if (err) {
1544 			xp_destroy(xs->pool);
1545 			xs->pool = NULL;
1546 			goto out_unlock;
1547 		}
1548 	}
1549 
1550 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1551 	xs->fq_tmp = NULL;
1552 	xs->cq_tmp = NULL;
1553 
1554 	xs->dev = dev;
1555 	xs->zc = xs->umem->zc;
1556 	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1557 	xs->queue_id = qid;
1558 	xp_add_xsk(xs->pool, xs);
1559 
1560 	if (qid < dev->real_num_rx_queues) {
1561 		struct netdev_rx_queue *rxq;
1562 
1563 		rxq = __netif_get_rx_queue(dev, qid);
1564 		if (rxq->napi)
1565 			__sk_mark_napi_id_once(sk, rxq->napi->napi_id);
1566 	}
1567 
1568 out_unlock:
1569 	if (err) {
1570 		dev_put(dev);
1571 	} else {
1572 		/* Matches smp_rmb() in bind() for shared umem
1573 		 * sockets, and xsk_is_bound().
1574 		 */
1575 		smp_wmb();
1576 		WRITE_ONCE(xs->state, XSK_BOUND);
1577 	}
1578 	netdev_unlock_ops(dev);
1579 out_release:
1580 	mutex_unlock(&xs->mutex);
1581 	rtnl_unlock();
1582 	return err;
1583 }
1584 
/* Shorter layout of the XDP_UMEM_REG argument. xsk_setsockopt() rejects an
 * optlen below this size and copies only this many bytes from userspace
 * when optlen is smaller than sizeof(struct xdp_umem_reg).
 */
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};
1591 
1592 static int xsk_setsockopt(struct socket *sock, int level, int optname,
1593 			  sockptr_t optval, unsigned int optlen)
1594 {
1595 	struct sock *sk = sock->sk;
1596 	struct xdp_sock *xs = xdp_sk(sk);
1597 	int err;
1598 
1599 	if (level != SOL_XDP)
1600 		return -ENOPROTOOPT;
1601 
1602 	switch (optname) {
1603 	case XDP_RX_RING:
1604 	case XDP_TX_RING:
1605 	{
1606 		struct xsk_queue **q;
1607 		int entries;
1608 
1609 		if (optlen < sizeof(entries))
1610 			return -EINVAL;
1611 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1612 			return -EFAULT;
1613 
1614 		mutex_lock(&xs->mutex);
1615 		if (xs->state != XSK_READY) {
1616 			mutex_unlock(&xs->mutex);
1617 			return -EBUSY;
1618 		}
1619 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1620 		err = xsk_init_queue(entries, q, false);
1621 		if (!err && optname == XDP_TX_RING)
1622 			/* Tx needs to be explicitly woken up the first time */
1623 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1624 		mutex_unlock(&xs->mutex);
1625 		return err;
1626 	}
1627 	case XDP_UMEM_REG:
1628 	{
1629 		size_t mr_size = sizeof(struct xdp_umem_reg);
1630 		struct xdp_umem_reg mr = {};
1631 		struct xdp_umem *umem;
1632 
1633 		if (optlen < sizeof(struct xdp_umem_reg_v1))
1634 			return -EINVAL;
1635 		else if (optlen < sizeof(mr))
1636 			mr_size = sizeof(struct xdp_umem_reg_v1);
1637 
1638 		BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg));
1639 
1640 		/* Make sure the last field of the struct doesn't have
1641 		 * uninitialized padding. All padding has to be explicit
1642 		 * and has to be set to zero by the userspace to make
1643 		 * struct xdp_umem_reg extensible in the future.
1644 		 */
1645 		BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) +
1646 			     sizeof_field(struct xdp_umem_reg, tx_metadata_len) !=
1647 			     sizeof(struct xdp_umem_reg));
1648 
1649 		if (copy_from_sockptr(&mr, optval, mr_size))
1650 			return -EFAULT;
1651 
1652 		mutex_lock(&xs->mutex);
1653 		if (xs->state != XSK_READY || xs->umem) {
1654 			mutex_unlock(&xs->mutex);
1655 			return -EBUSY;
1656 		}
1657 
1658 		umem = xdp_umem_create(&mr);
1659 		if (IS_ERR(umem)) {
1660 			mutex_unlock(&xs->mutex);
1661 			return PTR_ERR(umem);
1662 		}
1663 
1664 		/* Make sure umem is ready before it can be seen by others */
1665 		smp_wmb();
1666 		WRITE_ONCE(xs->umem, umem);
1667 		mutex_unlock(&xs->mutex);
1668 		return 0;
1669 	}
1670 	case XDP_UMEM_FILL_RING:
1671 	case XDP_UMEM_COMPLETION_RING:
1672 	{
1673 		struct xsk_queue **q;
1674 		int entries;
1675 
1676 		if (optlen < sizeof(entries))
1677 			return -EINVAL;
1678 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1679 			return -EFAULT;
1680 
1681 		mutex_lock(&xs->mutex);
1682 		if (xs->state != XSK_READY) {
1683 			mutex_unlock(&xs->mutex);
1684 			return -EBUSY;
1685 		}
1686 
1687 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1688 			&xs->cq_tmp;
1689 		err = xsk_init_queue(entries, q, true);
1690 		mutex_unlock(&xs->mutex);
1691 		return err;
1692 	}
1693 	case XDP_MAX_TX_SKB_BUDGET:
1694 	{
1695 		unsigned int budget;
1696 
1697 		if (optlen != sizeof(budget))
1698 			return -EINVAL;
1699 		if (copy_from_sockptr(&budget, optval, sizeof(budget)))
1700 			return -EFAULT;
1701 		if (!xs->tx ||
1702 		    budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
1703 			return -EACCES;
1704 
1705 		WRITE_ONCE(xs->max_tx_budget, budget);
1706 		return 0;
1707 	}
1708 	default:
1709 		break;
1710 	}
1711 
1712 	return -ENOPROTOOPT;
1713 }
1714 
1715 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1716 {
1717 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1718 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1719 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1720 }
1721 
1722 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1723 {
1724 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1725 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1726 	ring->desc = offsetof(struct xdp_umem_ring, desc);
1727 }
1728 
/* Shorter layout of the XDP_STATISTICS result. xsk_getsockopt() rejects a
 * buffer below this size and returns only this many bytes when the buffer
 * is smaller than struct xdp_statistics; in that case rx_dropped also
 * includes the socket's rx_queue_full count.
 */
struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};
1734 
1735 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1736 			  sockopt_t *opt)
1737 {
1738 	struct sock *sk = sock->sk;
1739 	struct xdp_sock *xs = xdp_sk(sk);
1740 	int len;
1741 
1742 	if (level != SOL_XDP)
1743 		return -ENOPROTOOPT;
1744 
1745 	len = opt->optlen;
1746 	if (len < 0)
1747 		return -EINVAL;
1748 
1749 	switch (optname) {
1750 	case XDP_STATISTICS:
1751 	{
1752 		struct xdp_statistics stats = {};
1753 		bool extra_stats = true;
1754 		size_t stats_size;
1755 
1756 		if (len < sizeof(struct xdp_statistics_v1)) {
1757 			return -EINVAL;
1758 		} else if (len < sizeof(stats)) {
1759 			extra_stats = false;
1760 			stats_size = sizeof(struct xdp_statistics_v1);
1761 		} else {
1762 			stats_size = sizeof(stats);
1763 		}
1764 
1765 		mutex_lock(&xs->mutex);
1766 		stats.rx_dropped = xs->rx_dropped;
1767 		if (extra_stats) {
1768 			stats.rx_ring_full = xs->rx_queue_full;
1769 			stats.rx_fill_ring_empty_descs =
1770 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1771 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1772 		} else {
1773 			stats.rx_dropped += xs->rx_queue_full;
1774 		}
1775 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1776 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1777 		mutex_unlock(&xs->mutex);
1778 
1779 		if (copy_to_iter(&stats, stats_size, &opt->iter_out) !=
1780 		    stats_size)
1781 			return -EFAULT;
1782 		opt->optlen = stats_size;
1783 
1784 		return 0;
1785 	}
1786 	case XDP_MMAP_OFFSETS:
1787 	{
1788 		struct xdp_mmap_offsets off;
1789 		struct xdp_mmap_offsets_v1 off_v1;
1790 		bool flags_supported = true;
1791 		void *to_copy;
1792 
1793 		if (len < sizeof(off_v1))
1794 			return -EINVAL;
1795 		else if (len < sizeof(off))
1796 			flags_supported = false;
1797 
1798 		if (flags_supported) {
1799 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1800 			 * except for the flags field added to the end.
1801 			 */
1802 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1803 					       &off.rx);
1804 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1805 					       &off.tx);
1806 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1807 					       &off.fr);
1808 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1809 					       &off.cr);
1810 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1811 						ptrs.flags);
1812 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1813 						ptrs.flags);
1814 			off.fr.flags = offsetof(struct xdp_umem_ring,
1815 						ptrs.flags);
1816 			off.cr.flags = offsetof(struct xdp_umem_ring,
1817 						ptrs.flags);
1818 
1819 			len = sizeof(off);
1820 			to_copy = &off;
1821 		} else {
1822 			xsk_enter_rxtx_offsets(&off_v1.rx);
1823 			xsk_enter_rxtx_offsets(&off_v1.tx);
1824 			xsk_enter_umem_offsets(&off_v1.fr);
1825 			xsk_enter_umem_offsets(&off_v1.cr);
1826 
1827 			len = sizeof(off_v1);
1828 			to_copy = &off_v1;
1829 		}
1830 
1831 		if (copy_to_iter(to_copy, len, &opt->iter_out) != len)
1832 			return -EFAULT;
1833 		opt->optlen = len;
1834 
1835 		return 0;
1836 	}
1837 	case XDP_OPTIONS:
1838 	{
1839 		struct xdp_options opts = {};
1840 
1841 		if (len < sizeof(opts))
1842 			return -EINVAL;
1843 
1844 		mutex_lock(&xs->mutex);
1845 		if (xs->zc)
1846 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1847 		mutex_unlock(&xs->mutex);
1848 
1849 		len = sizeof(opts);
1850 		if (copy_to_iter(&opts, len, &opt->iter_out) != len)
1851 			return -EFAULT;
1852 		opt->optlen = len;
1853 
1854 		return 0;
1855 	}
1856 	default:
1857 		break;
1858 	}
1859 
1860 	return -EOPNOTSUPP;
1861 }
1862 
1863 static int xsk_mmap(struct file *file, struct socket *sock,
1864 		    struct vm_area_struct *vma)
1865 {
1866 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1867 	unsigned long size = vma->vm_end - vma->vm_start;
1868 	struct xdp_sock *xs = xdp_sk(sock->sk);
1869 	int state = READ_ONCE(xs->state);
1870 	struct xsk_queue *q = NULL;
1871 
1872 	if (state != XSK_READY && state != XSK_BOUND)
1873 		return -EBUSY;
1874 
1875 	if (offset == XDP_PGOFF_RX_RING) {
1876 		q = READ_ONCE(xs->rx);
1877 	} else if (offset == XDP_PGOFF_TX_RING) {
1878 		q = READ_ONCE(xs->tx);
1879 	} else {
1880 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1881 		smp_rmb();
1882 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1883 			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
1884 						 READ_ONCE(xs->pool->fq);
1885 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1886 			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
1887 						 READ_ONCE(xs->pool->cq);
1888 	}
1889 
1890 	if (!q)
1891 		return -EINVAL;
1892 
1893 	/* Matches the smp_wmb() in xsk_init_queue */
1894 	smp_rmb();
1895 	if (size > q->ring_vmalloc_size)
1896 		return -EINVAL;
1897 
1898 	return remap_vmalloc_range(vma, q->ring, 0);
1899 }
1900 
1901 static int xsk_notifier(struct notifier_block *this,
1902 			unsigned long msg, void *ptr)
1903 {
1904 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1905 	struct net *net = dev_net(dev);
1906 	struct sock *sk;
1907 
1908 	switch (msg) {
1909 	case NETDEV_UNREGISTER:
1910 		mutex_lock(&net->xdp.lock);
1911 		sk_for_each(sk, &net->xdp.list) {
1912 			struct xdp_sock *xs = xdp_sk(sk);
1913 
1914 			mutex_lock(&xs->mutex);
1915 			if (xs->dev == dev) {
1916 				sk->sk_err = ENETDOWN;
1917 				if (!sock_flag(sk, SOCK_DEAD))
1918 					sk_error_report(sk);
1919 
1920 				xsk_unbind_dev(xs);
1921 
1922 				/* Clear device references. */
1923 				xp_clear_dev(xs->pool);
1924 			}
1925 			mutex_unlock(&xs->mutex);
1926 		}
1927 		mutex_unlock(&net->xdp.lock);
1928 		break;
1929 	}
1930 	return NOTIFY_DONE;
1931 }
1932 
1933 static struct proto xsk_proto = {
1934 	.name =		"XDP",
1935 	.owner =	THIS_MODULE,
1936 	.obj_size =	sizeof(struct xdp_sock),
1937 };
1938 
1939 static const struct proto_ops xsk_proto_ops = {
1940 	.family		= PF_XDP,
1941 	.owner		= THIS_MODULE,
1942 	.release	= xsk_release,
1943 	.bind		= xsk_bind,
1944 	.connect	= sock_no_connect,
1945 	.socketpair	= sock_no_socketpair,
1946 	.accept		= sock_no_accept,
1947 	.getname	= sock_no_getname,
1948 	.poll		= xsk_poll,
1949 	.ioctl		= sock_no_ioctl,
1950 	.listen		= sock_no_listen,
1951 	.shutdown	= sock_no_shutdown,
1952 	.setsockopt	= xsk_setsockopt,
1953 	.getsockopt_iter = xsk_getsockopt,
1954 	.sendmsg	= xsk_sendmsg,
1955 	.recvmsg	= xsk_recvmsg,
1956 	.mmap		= xsk_mmap,
1957 };
1958 
1959 static void xsk_destruct(struct sock *sk)
1960 {
1961 	struct xdp_sock *xs = xdp_sk(sk);
1962 
1963 	if (!sock_flag(sk, SOCK_DEAD))
1964 		return;
1965 
1966 	if (!xp_put_pool(xs->pool))
1967 		xdp_put_umem(xs->umem, !xs->pool);
1968 }
1969 
1970 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1971 		      int kern)
1972 {
1973 	struct xdp_sock *xs;
1974 	struct sock *sk;
1975 
1976 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1977 		return -EPERM;
1978 	if (sock->type != SOCK_RAW)
1979 		return -ESOCKTNOSUPPORT;
1980 
1981 	if (protocol)
1982 		return -EPROTONOSUPPORT;
1983 
1984 	sock->state = SS_UNCONNECTED;
1985 
1986 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1987 	if (!sk)
1988 		return -ENOBUFS;
1989 
1990 	sock->ops = &xsk_proto_ops;
1991 
1992 	sock_init_data(sock, sk);
1993 
1994 	sk->sk_family = PF_XDP;
1995 
1996 	sk->sk_destruct = xsk_destruct;
1997 
1998 	sock_set_flag(sk, SOCK_RCU_FREE);
1999 
2000 	xs = xdp_sk(sk);
2001 	xs->state = XSK_READY;
2002 	xs->max_tx_budget = TX_BATCH_SIZE;
2003 	mutex_init(&xs->mutex);
2004 
2005 	INIT_LIST_HEAD(&xs->map_list);
2006 	spin_lock_init(&xs->map_list_lock);
2007 
2008 	mutex_lock(&net->xdp.lock);
2009 	sk_add_node_rcu(sk, &net->xdp.list);
2010 	mutex_unlock(&net->xdp.lock);
2011 
2012 	sock_prot_inuse_add(net, &xsk_proto, 1);
2013 
2014 	return 0;
2015 }
2016 
2017 static const struct net_proto_family xsk_family_ops = {
2018 	.family = PF_XDP,
2019 	.create = xsk_create,
2020 	.owner	= THIS_MODULE,
2021 };
2022 
2023 static struct notifier_block xsk_netdev_notifier = {
2024 	.notifier_call	= xsk_notifier,
2025 };
2026 
2027 static int __net_init xsk_net_init(struct net *net)
2028 {
2029 	mutex_init(&net->xdp.lock);
2030 	INIT_HLIST_HEAD(&net->xdp.list);
2031 	return 0;
2032 }
2033 
2034 static void __net_exit xsk_net_exit(struct net *net)
2035 {
2036 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
2037 }
2038 
2039 static struct pernet_operations xsk_net_ops = {
2040 	.init = xsk_net_init,
2041 	.exit = xsk_net_exit,
2042 };
2043 
2044 static int __init xsk_init(void)
2045 {
2046 	int err;
2047 
2048 	err = proto_register(&xsk_proto, 0 /* no slab */);
2049 	if (err)
2050 		goto out;
2051 
2052 	err = sock_register(&xsk_family_ops);
2053 	if (err)
2054 		goto out_proto;
2055 
2056 	err = register_pernet_subsys(&xsk_net_ops);
2057 	if (err)
2058 		goto out_sk;
2059 
2060 	err = register_netdevice_notifier(&xsk_netdev_notifier);
2061 	if (err)
2062 		goto out_pernet;
2063 
2064 	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
2065 						 sizeof(struct xsk_addrs),
2066 						 0, SLAB_HWCACHE_ALIGN, NULL);
2067 	if (!xsk_tx_generic_cache) {
2068 		err = -ENOMEM;
2069 		goto out_unreg_notif;
2070 	}
2071 
2072 	return 0;
2073 
2074 out_unreg_notif:
2075 	unregister_netdevice_notifier(&xsk_netdev_notifier);
2076 out_pernet:
2077 	unregister_pernet_subsys(&xsk_net_ops);
2078 out_sk:
2079 	sock_unregister(PF_XDP);
2080 out_proto:
2081 	proto_unregister(&xsk_proto);
2082 out:
2083 	return err;
2084 }
2085 
2086 fs_initcall(xsk_init);
2087