xref: /linux/drivers/net/netkit.c (revision 90e63d5354951d37fa2b3b91e6f17b95d2bf9bee)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2023 Isovalent */
3 
4 #include <linux/netdevice.h>
5 #include <linux/ethtool.h>
6 #include <linux/etherdevice.h>
7 #include <linux/filter.h>
8 #include <linux/netfilter_netdev.h>
9 #include <linux/bpf_mprog.h>
10 #include <linux/indirect_call_wrapper.h>
11 
12 #include <net/netdev_lock.h>
13 #include <net/netdev_queues.h>
14 #include <net/netdev_rx_queue.h>
15 #include <net/xdp_sock_drv.h>
16 #include <net/netkit.h>
17 #include <net/dst.h>
18 #include <net/tcx.h>
19 
/* Driver name reported through the ethtool drvinfo callback. */
#define NETKIT_DRV_NAME	"netkit"

/* Upper bounds on the queue counts accepted when allocating a device. */
#define NETKIT_NUM_RX_QUEUES_MAX  1024
#define NETKIT_NUM_TX_QUEUES_MAX  1

/* Number of queues a freshly allocated device starts out with. */
#define NETKIT_NUM_RX_QUEUES_REAL 1
#define NETKIT_NUM_TX_QUEUES_REAL 1
27 
28 struct netkit {
29 	__cacheline_group_begin(netkit_fastpath);
30 	struct net_device __rcu *peer;
31 	struct bpf_mprog_entry __rcu *active;
32 	enum netkit_action policy;
33 	enum netkit_scrub scrub;
34 	struct bpf_mprog_bundle	bundle;
35 	__cacheline_group_end(netkit_fastpath);
36 
37 	__cacheline_group_begin(netkit_slowpath);
38 	enum netkit_mode mode;
39 	enum netkit_pairing pair;
40 	bool primary;
41 	u32 headroom;
42 	__cacheline_group_end(netkit_slowpath);
43 };
44 
45 struct netkit_link {
46 	struct bpf_link link;
47 	struct net_device *dev;
48 };
49 
50 static struct rtnl_link_ops netkit_link_ops;
51 
52 static __always_inline int
53 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
54 	   enum netkit_action ret)
55 {
56 	const struct bpf_mprog_fp *fp;
57 	const struct bpf_prog *prog;
58 
59 	bpf_mprog_foreach_prog(entry, fp, prog) {
60 		bpf_compute_data_pointers(skb);
61 		ret = bpf_prog_run(prog, skb);
62 		if (ret != NETKIT_NEXT)
63 			break;
64 	}
65 	return ret;
66 }
67 
68 static void netkit_xnet(struct sk_buff *skb)
69 {
70 	skb->priority = 0;
71 	skb->mark = 0;
72 }
73 
74 static void netkit_prep_forward(struct sk_buff *skb,
75 				bool xnet, bool xnet_scrub)
76 {
77 	skb_scrub_packet(skb, false);
78 	nf_skip_egress(skb, true);
79 	skb_reset_mac_header(skb);
80 	if (!xnet)
81 		return;
82 	skb_clear_tstamp(skb);
83 	if (xnet_scrub)
84 		netkit_xnet(skb);
85 }
86 
/* Return the netkit private area of @dev. */
static struct netkit *netkit_priv(const struct net_device *dev)
{
	struct netkit *nk = netdev_priv(dev);

	return nk;
}
91 
92 static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
93 {
94 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
95 	struct netkit *nk = netkit_priv(dev);
96 	enum netkit_action ret = READ_ONCE(nk->policy);
97 	netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
98 	const struct bpf_mprog_entry *entry;
99 	struct net_device *peer;
100 	int len = skb->len;
101 
102 	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
103 	rcu_read_lock();
104 	peer = rcu_dereference(nk->peer);
105 	if (unlikely(!peer || !(peer->flags & IFF_UP) ||
106 		     !pskb_may_pull(skb, ETH_HLEN) ||
107 		     skb_orphan_frags(skb, GFP_ATOMIC)))
108 		goto drop;
109 	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)),
110 			    nk->scrub);
111 	eth_skb_pkt_type(skb, peer);
112 	skb->dev = peer;
113 	entry = rcu_dereference(nk->active);
114 	if (entry)
115 		ret = netkit_run(entry, skb, ret);
116 	switch (ret) {
117 	case NETKIT_NEXT:
118 	case NETKIT_PASS:
119 		eth_skb_pull_mac(skb);
120 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
121 		if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
122 			dev_sw_netstats_tx_add(dev, 1, len);
123 			dev_sw_netstats_rx_add(peer, len);
124 		} else {
125 			goto drop_stats;
126 		}
127 		break;
128 	case NETKIT_REDIRECT:
129 		dev_sw_netstats_tx_add(dev, 1, len);
130 		skb_do_redirect(skb);
131 		break;
132 	case NETKIT_DROP:
133 	default:
134 drop:
135 		kfree_skb(skb);
136 drop_stats:
137 		dev_core_stats_tx_dropped_inc(dev);
138 		ret_dev = NET_XMIT_DROP;
139 		break;
140 	}
141 	rcu_read_unlock();
142 	bpf_net_ctx_clear(bpf_net_ctx);
143 	return ret_dev;
144 }
145 
146 static int netkit_open(struct net_device *dev)
147 {
148 	struct netkit *nk = netkit_priv(dev);
149 	struct net_device *peer = rtnl_dereference(nk->peer);
150 
151 	if (nk->pair == NETKIT_DEVICE_SINGLE) {
152 		netif_carrier_on(dev);
153 		return 0;
154 	}
155 	if (!peer)
156 		return -ENOTCONN;
157 	if (peer->flags & IFF_UP) {
158 		netif_carrier_on(dev);
159 		netif_carrier_on(peer);
160 	}
161 	return 0;
162 }
163 
164 static int netkit_close(struct net_device *dev)
165 {
166 	struct netkit *nk = netkit_priv(dev);
167 	struct net_device *peer = rtnl_dereference(nk->peer);
168 
169 	netif_carrier_off(dev);
170 	if (peer)
171 		netif_carrier_off(peer);
172 	return 0;
173 }
174 
175 static int netkit_get_iflink(const struct net_device *dev)
176 {
177 	struct netkit *nk = netkit_priv(dev);
178 	struct net_device *peer;
179 	int iflink = 0;
180 
181 	rcu_read_lock();
182 	peer = rcu_dereference(nk->peer);
183 	if (peer)
184 		iflink = READ_ONCE(peer->ifindex);
185 	rcu_read_unlock();
186 	return iflink;
187 }
188 
/*
 * Rx-mode callback. There is no filter to program since the device
 * receives whatever gets pushed to it, so this always returns 0.
 */
static int netkit_set_multicast(struct net_device *dev,
				struct netdev_hw_addr_list *uc,
				struct netdev_hw_addr_list *mc)
{
	return 0;
}
196 
197 static int netkit_set_macaddr(struct net_device *dev, void *sa)
198 {
199 	struct netkit *nk = netkit_priv(dev);
200 
201 	if (nk->mode != NETKIT_L2)
202 		return -EOPNOTSUPP;
203 
204 	return eth_mac_addr(dev, sa);
205 }
206 
207 static void netkit_set_headroom(struct net_device *dev, int headroom)
208 {
209 	struct netkit *nk = netkit_priv(dev), *nk2;
210 	struct net_device *peer;
211 
212 	if (headroom < 0)
213 		headroom = NET_SKB_PAD;
214 
215 	rcu_read_lock();
216 	peer = rcu_dereference(nk->peer);
217 	if (!peer) {
218 		nk->headroom = headroom;
219 		dev->needed_headroom = headroom;
220 	} else {
221 		nk2 = netkit_priv(peer);
222 		nk->headroom = headroom;
223 		headroom = max(nk->headroom, nk2->headroom);
224 
225 		peer->needed_headroom = headroom;
226 		dev->needed_headroom = headroom;
227 	}
228 	rcu_read_unlock();
229 }
230 
231 INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev)
232 {
233 	return rcu_dereference(netkit_priv(dev)->peer);
234 }
235 
236 static void netkit_get_stats(struct net_device *dev,
237 			     struct rtnl_link_stats64 *stats)
238 {
239 	dev_fetch_sw_netstats(stats, dev->tstats);
240 	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
241 }
242 
243 static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
244 {
245 	if (!dev->netdev_ops->ndo_bpf ||
246 	    !dev->netdev_ops->ndo_xdp_xmit ||
247 	    !dev->netdev_ops->ndo_xsk_wakeup)
248 		return false;
249 	return true;
250 }
251 
252 static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
253 {
254 	struct netkit *nk = netkit_priv(dev);
255 	struct netdev_bpf xdp_lower;
256 	struct netdev_rx_queue *rxq;
257 	struct net_device *phys;
258 	bool create = false;
259 	int ret = -EBUSY;
260 
261 	switch (xdp->command) {
262 	case XDP_SETUP_XSK_POOL:
263 		if (nk->pair == NETKIT_DEVICE_PAIR)
264 			return -EOPNOTSUPP;
265 		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
266 			return -EINVAL;
267 
268 		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
269 		if (!rxq->lease)
270 			return -EOPNOTSUPP;
271 
272 		phys = rxq->lease->dev;
273 		if (!netkit_xsk_supported_at_phys(phys))
274 			return -EOPNOTSUPP;
275 
276 		create = xdp->xsk.pool;
277 		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
278 		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease);
279 		break;
280 	case XDP_SETUP_PROG:
281 		return -EOPNOTSUPP;
282 	default:
283 		return -EINVAL;
284 	}
285 
286 	netdev_lock(phys);
287 	if (create &&
288 	    (phys->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) {
289 		ret = -EOPNOTSUPP;
290 		goto out;
291 	}
292 	if (!create || !dev_get_min_mp_channel_count(phys))
293 		ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
294 out:
295 	netdev_unlock(phys);
296 	return ret;
297 }
298 
299 static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
300 {
301 	struct netdev_rx_queue *rxq, *rxq_lease;
302 	struct net_device *phys;
303 
304 	if (queue_id >= dev->real_num_rx_queues)
305 		return -EINVAL;
306 
307 	rxq = __netif_get_rx_queue(dev, queue_id);
308 	rxq_lease = READ_ONCE(rxq->lease);
309 	if (unlikely(!rxq_lease))
310 		return -EOPNOTSUPP;
311 
312 	/* netkit_xsk already validated full xsk support, hence it's
313 	 * fine to call into ndo_xsk_wakeup right away given this
314 	 * was a prerequisite to get here in the first place. The
315 	 * phys xsk support cannot change without tearing down the
316 	 * device (which clears the lease first).
317 	 */
318 	phys = rxq_lease->dev;
319 	return phys->netdev_ops->ndo_xsk_wakeup(phys,
320 			get_netdev_rx_queue_index(rxq_lease), flags);
321 }
322 
/* ndo_init handler: set up the lockdep classes for @dev. Always succeeds. */
static int netkit_init(struct net_device *dev)
{
	netdev_lockdep_set_classes(dev);

	return 0;
}
328 
329 static void netkit_uninit(struct net_device *dev);
330 
331 static const struct net_device_ops netkit_netdev_ops = {
332 	.ndo_init		= netkit_init,
333 	.ndo_open		= netkit_open,
334 	.ndo_stop		= netkit_close,
335 	.ndo_start_xmit		= netkit_xmit,
336 	.ndo_set_rx_mode_async	= netkit_set_multicast,
337 	.ndo_set_rx_headroom	= netkit_set_headroom,
338 	.ndo_set_mac_address	= netkit_set_macaddr,
339 	.ndo_get_iflink		= netkit_get_iflink,
340 	.ndo_get_peer_dev	= netkit_peer_dev,
341 	.ndo_get_stats64	= netkit_get_stats,
342 	.ndo_uninit		= netkit_uninit,
343 	.ndo_bpf		= netkit_xsk,
344 	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
345 	.ndo_features_check	= passthru_features_check,
346 };
347 
348 static void netkit_get_drvinfo(struct net_device *dev,
349 			       struct ethtool_drvinfo *info)
350 {
351 	strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver));
352 }
353 
354 static const struct ethtool_ops netkit_ethtool_ops = {
355 	.get_drvinfo		= netkit_get_drvinfo,
356 };
357 
358 static int netkit_queue_create(struct net_device *dev,
359 			       struct netlink_ext_ack *extack)
360 {
361 	struct netkit *nk = netkit_priv(dev);
362 	u32 rxq_count_old, rxq_count_new;
363 	int err;
364 
365 	rxq_count_old = dev->real_num_rx_queues;
366 	rxq_count_new = rxq_count_old + 1;
367 
368 	/* In paired mode, only the non-primary (peer) device can
369 	 * create leased queues since the primary is the management
370 	 * side. In single device mode, leasing is always allowed.
371 	 */
372 	if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) {
373 		NL_SET_ERR_MSG(extack,
374 			       "netkit can only lease against the peer device");
375 		return -EOPNOTSUPP;
376 	}
377 
378 	err = netif_set_real_num_rx_queues(dev, rxq_count_new);
379 	if (err) {
380 		if (rxq_count_new > dev->num_rx_queues)
381 			NL_SET_ERR_MSG(extack,
382 				       "netkit maximum queue limit reached");
383 		else
384 			NL_SET_ERR_MSG_FMT(extack,
385 					   "netkit cannot create more queues err=%d", err);
386 		return err;
387 	}
388 
389 	return rxq_count_old;
390 }
391 
392 static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = {
393 	.ndo_queue_create	= netkit_queue_create,
394 };
395 
396 static struct net_device *netkit_alloc(struct nlattr *tb[],
397 				       const char *ifname,
398 				       unsigned char name_assign_type,
399 				       unsigned int num_tx_queues,
400 				       unsigned int num_rx_queues)
401 {
402 	const struct rtnl_link_ops *ops = &netkit_link_ops;
403 	struct net_device *dev;
404 
405 	if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX ||
406 	    num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX)
407 		return ERR_PTR(-EOPNOTSUPP);
408 
409 	dev = alloc_netdev_mqs(ops->priv_size, ifname,
410 			       name_assign_type, ops->setup,
411 			       num_tx_queues, num_rx_queues);
412 	if (dev) {
413 		dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL;
414 		dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL;
415 	}
416 	return dev;
417 }
418 
419 static void netkit_queue_unlease(struct net_device *dev)
420 {
421 	struct netdev_rx_queue *rxq, *rxq_lease;
422 	struct net_device *dev_lease;
423 	int i;
424 
425 	if (dev->real_num_rx_queues == 1)
426 		return;
427 
428 	netdev_lock(dev);
429 	for (i = 1; i < dev->real_num_rx_queues; i++) {
430 		rxq = __netif_get_rx_queue(dev, i);
431 		rxq_lease = rxq->lease;
432 		dev_lease = rxq_lease->dev;
433 
434 		netdev_lock(dev_lease);
435 		netdev_rx_queue_unlease(rxq, rxq_lease);
436 		netdev_unlock(dev_lease);
437 	}
438 	netdev_unlock(dev);
439 }
440 
441 static void netkit_setup(struct net_device *dev)
442 {
443 	static const netdev_features_t netkit_features_hw_vlan =
444 		NETIF_F_HW_VLAN_CTAG_TX |
445 		NETIF_F_HW_VLAN_CTAG_RX |
446 		NETIF_F_HW_VLAN_STAG_TX |
447 		NETIF_F_HW_VLAN_STAG_RX;
448 	static const netdev_features_t netkit_features =
449 		netkit_features_hw_vlan |
450 		NETIF_F_SG |
451 		NETIF_F_FRAGLIST |
452 		NETIF_F_HW_CSUM |
453 		NETIF_F_RXCSUM |
454 		NETIF_F_SCTP_CRC |
455 		NETIF_F_HIGHDMA |
456 		NETIF_F_GSO_SOFTWARE |
457 		NETIF_F_GSO_ENCAP_ALL;
458 
459 	ether_setup(dev);
460 	dev->max_mtu = ETH_MAX_MTU;
461 	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
462 
463 	dev->flags |= IFF_NOARP;
464 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
465 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
466 	dev->priv_flags |= IFF_PHONY_HEADROOM;
467 	dev->priv_flags |= IFF_NO_QUEUE;
468 	dev->priv_flags |= IFF_DISABLE_NETPOLL;
469 	dev->lltx = true;
470 	dev->netmem_tx = NETMEM_TX_NO_DMA;
471 
472 	dev->netdev_ops     = &netkit_netdev_ops;
473 	dev->ethtool_ops    = &netkit_ethtool_ops;
474 	dev->queue_mgmt_ops = &netkit_queue_mgmt_ops;
475 
476 	dev->features |= netkit_features;
477 	dev->hw_features = netkit_features;
478 	dev->hw_enc_features = netkit_features;
479 	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
480 	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
481 
482 	dev->needs_free_netdev = true;
483 
484 	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
485 }
486 
487 static struct net *netkit_get_link_net(const struct net_device *dev)
488 {
489 	struct netkit *nk = netkit_priv(dev);
490 	struct net_device *peer = rtnl_dereference(nk->peer);
491 
492 	return peer ? dev_net(peer) : dev_net(dev);
493 }
494 
495 static int netkit_check_policy(int policy, struct nlattr *tb,
496 			       struct netlink_ext_ack *extack)
497 {
498 	switch (policy) {
499 	case NETKIT_PASS:
500 	case NETKIT_DROP:
501 		return 0;
502 	default:
503 		NL_SET_ERR_MSG_ATTR(extack, tb,
504 				    "Provided default xmit policy not supported");
505 		return -EINVAL;
506 	}
507 }
508 
509 static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
510 			   struct netlink_ext_ack *extack)
511 {
512 	struct nlattr *attr = tb[IFLA_ADDRESS];
513 
514 	if (!attr)
515 		return 0;
516 	if (nla_len(attr) != ETH_ALEN)
517 		return -EINVAL;
518 	if (!is_valid_ether_addr(nla_data(attr)))
519 		return -EADDRNOTAVAIL;
520 	return 0;
521 }
522 
523 static int netkit_new_link(struct net_device *dev,
524 			   struct rtnl_newlink_params *params,
525 			   struct netlink_ext_ack *extack)
526 {
527 	struct net *peer_net = rtnl_newlink_peer_net(params);
528 	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
529 	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
530 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
531 	enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
532 	enum netkit_action policy_prim = NETKIT_PASS;
533 	enum netkit_action policy_peer = NETKIT_PASS;
534 	bool seen_peer = false, seen_scrub = false;
535 	struct nlattr **data = params->data;
536 	enum netkit_mode mode = NETKIT_L3;
537 	unsigned char ifname_assign_type;
538 	struct nlattr **tb = params->tb;
539 	u16 headroom = 0, tailroom = 0;
540 	struct ifinfomsg *ifmp = NULL;
541 	struct net_device *peer = NULL;
542 	char ifname[IFNAMSIZ];
543 	struct netkit *nk;
544 	int err;
545 
546 	tbp = tb;
547 	if (data) {
548 		if (data[IFLA_NETKIT_MODE])
549 			mode = nla_get_u32(data[IFLA_NETKIT_MODE]);
550 		if (data[IFLA_NETKIT_PEER_INFO]) {
551 			attr = data[IFLA_NETKIT_PEER_INFO];
552 			ifmp = nla_data(attr);
553 			rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
554 			tbp = peer_tb;
555 		}
556 		if (data[IFLA_NETKIT_SCRUB])
557 			scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]);
558 		if (data[IFLA_NETKIT_PEER_SCRUB])
559 			scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]);
560 		if (data[IFLA_NETKIT_POLICY]) {
561 			attr = data[IFLA_NETKIT_POLICY];
562 			policy_prim = nla_get_u32(attr);
563 			err = netkit_check_policy(policy_prim, attr, extack);
564 			if (err < 0)
565 				return err;
566 		}
567 		if (data[IFLA_NETKIT_PEER_POLICY]) {
568 			attr = data[IFLA_NETKIT_PEER_POLICY];
569 			policy_peer = nla_get_u32(attr);
570 			err = netkit_check_policy(policy_peer, attr, extack);
571 			if (err < 0)
572 				return err;
573 		}
574 		if (data[IFLA_NETKIT_HEADROOM])
575 			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
576 		if (data[IFLA_NETKIT_TAILROOM])
577 			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
578 		if (data[IFLA_NETKIT_PAIRING])
579 			pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
580 
581 		seen_scrub = data[IFLA_NETKIT_SCRUB];
582 		seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
583 			    data[IFLA_NETKIT_PEER_SCRUB] ||
584 			    data[IFLA_NETKIT_PEER_POLICY];
585 	}
586 
587 	if (ifmp && tbp[IFLA_IFNAME]) {
588 		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
589 		ifname_assign_type = NET_NAME_USER;
590 	} else {
591 		strscpy(ifname, "nk%d", IFNAMSIZ);
592 		ifname_assign_type = NET_NAME_ENUM;
593 	}
594 	if (mode != NETKIT_L2 &&
595 	    (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
596 		return -EOPNOTSUPP;
597 	if (pair == NETKIT_DEVICE_SINGLE &&
598 	    (tb != tbp || seen_peer || seen_scrub ||
599 	     policy_prim != NETKIT_PASS))
600 		return -EOPNOTSUPP;
601 
602 	if (pair == NETKIT_DEVICE_PAIR) {
603 		peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
604 					&netkit_link_ops, tbp, extack);
605 		if (IS_ERR(peer))
606 			return PTR_ERR(peer);
607 
608 		netif_inherit_tso_max(peer, dev);
609 		if (headroom)
610 			peer->needed_headroom = headroom;
611 		if (tailroom)
612 			peer->needed_tailroom = tailroom;
613 		if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
614 			eth_hw_addr_random(peer);
615 		if (ifmp && dev->ifindex)
616 			peer->ifindex = ifmp->ifi_index;
617 
618 		nk = netkit_priv(peer);
619 		nk->primary = false;
620 		nk->policy = policy_peer;
621 		nk->scrub = scrub_peer;
622 		nk->mode = mode;
623 		nk->pair = pair;
624 		nk->headroom = headroom;
625 		bpf_mprog_bundle_init(&nk->bundle);
626 
627 		err = register_netdevice(peer);
628 		if (err < 0)
629 			goto err_register_peer;
630 		netif_carrier_off(peer);
631 		if (mode == NETKIT_L2)
632 			dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
633 
634 		err = rtnl_configure_link(peer, NULL, 0, NULL);
635 		if (err < 0)
636 			goto err_configure_peer;
637 	}
638 
639 	if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
640 		eth_hw_addr_random(dev);
641 	if (tb[IFLA_IFNAME])
642 		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
643 	else
644 		strscpy(dev->name, "nk%d", IFNAMSIZ);
645 	if (headroom)
646 		dev->needed_headroom = headroom;
647 	if (tailroom)
648 		dev->needed_tailroom = tailroom;
649 
650 	nk = netkit_priv(dev);
651 	nk->primary = true;
652 	nk->policy = policy_prim;
653 	nk->scrub = scrub_prim;
654 	nk->mode = mode;
655 	nk->pair = pair;
656 	nk->headroom = headroom;
657 	bpf_mprog_bundle_init(&nk->bundle);
658 
659 	if (pair == NETKIT_DEVICE_SINGLE)
660 		xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
661 
662 	err = register_netdevice(dev);
663 	if (err < 0)
664 		goto err_configure_peer;
665 	netif_carrier_off(dev);
666 	if (mode == NETKIT_L2)
667 		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
668 
669 	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
670 	if (peer)
671 		rcu_assign_pointer(netkit_priv(peer)->peer, dev);
672 	return 0;
673 err_configure_peer:
674 	if (peer)
675 		unregister_netdevice(peer);
676 	return err;
677 err_register_peer:
678 	free_netdev(peer);
679 	return err;
680 }
681 
682 static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
683 						  bool bundle_fallback)
684 {
685 	struct netkit *nk = netkit_priv(dev);
686 	struct bpf_mprog_entry *entry;
687 
688 	ASSERT_RTNL();
689 	entry = rcu_dereference_rtnl(nk->active);
690 	if (entry)
691 		return entry;
692 	if (bundle_fallback)
693 		return &nk->bundle.a;
694 	return NULL;
695 }
696 
697 static void netkit_entry_update(struct net_device *dev,
698 				struct bpf_mprog_entry *entry)
699 {
700 	struct netkit *nk = netkit_priv(dev);
701 
702 	ASSERT_RTNL();
703 	rcu_assign_pointer(nk->active, entry);
704 }
705 
/* Wait for an RCU grace period after the active entry was switched. */
static void netkit_entry_sync(void)
{
	synchronize_rcu();
}
710 
711 static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
712 {
713 	struct net_device *dev;
714 	struct netkit *nk;
715 
716 	ASSERT_RTNL();
717 
718 	switch (which) {
719 	case BPF_NETKIT_PRIMARY:
720 	case BPF_NETKIT_PEER:
721 		break;
722 	default:
723 		return ERR_PTR(-EINVAL);
724 	}
725 
726 	dev = __dev_get_by_index(net, ifindex);
727 	if (!dev)
728 		return ERR_PTR(-ENODEV);
729 	if (dev->netdev_ops != &netkit_netdev_ops)
730 		return ERR_PTR(-ENXIO);
731 
732 	nk = netkit_priv(dev);
733 	if (!nk->primary)
734 		return ERR_PTR(-EACCES);
735 	if (nk->pair == NETKIT_DEVICE_SINGLE)
736 		return ERR_PTR(-EOPNOTSUPP);
737 	if (which == BPF_NETKIT_PEER) {
738 		dev = rcu_dereference_rtnl(nk->peer);
739 		if (!dev)
740 			return ERR_PTR(-ENODEV);
741 	}
742 	return dev;
743 }
744 
745 int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
746 {
747 	struct bpf_mprog_entry *entry, *entry_new;
748 	struct bpf_prog *replace_prog = NULL;
749 	struct net_device *dev;
750 	int ret;
751 
752 	rtnl_lock();
753 	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
754 			       attr->attach_type);
755 	if (IS_ERR(dev)) {
756 		ret = PTR_ERR(dev);
757 		goto out;
758 	}
759 	entry = netkit_entry_fetch(dev, true);
760 	if (attr->attach_flags & BPF_F_REPLACE) {
761 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
762 						 prog->type);
763 		if (IS_ERR(replace_prog)) {
764 			ret = PTR_ERR(replace_prog);
765 			replace_prog = NULL;
766 			goto out;
767 		}
768 	}
769 	ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
770 			       attr->attach_flags, attr->relative_fd,
771 			       attr->expected_revision);
772 	if (!ret) {
773 		if (entry != entry_new) {
774 			netkit_entry_update(dev, entry_new);
775 			netkit_entry_sync();
776 		}
777 		bpf_mprog_commit(entry);
778 	}
779 out:
780 	if (replace_prog)
781 		bpf_prog_put(replace_prog);
782 	rtnl_unlock();
783 	return ret;
784 }
785 
786 int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
787 {
788 	struct bpf_mprog_entry *entry, *entry_new;
789 	struct net_device *dev;
790 	int ret;
791 
792 	rtnl_lock();
793 	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
794 			       attr->attach_type);
795 	if (IS_ERR(dev)) {
796 		ret = PTR_ERR(dev);
797 		goto out;
798 	}
799 	entry = netkit_entry_fetch(dev, false);
800 	if (!entry) {
801 		ret = -ENOENT;
802 		goto out;
803 	}
804 	ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
805 			       attr->relative_fd, attr->expected_revision);
806 	if (!ret) {
807 		if (!bpf_mprog_total(entry_new))
808 			entry_new = NULL;
809 		netkit_entry_update(dev, entry_new);
810 		netkit_entry_sync();
811 		bpf_mprog_commit(entry);
812 	}
813 out:
814 	rtnl_unlock();
815 	return ret;
816 }
817 
818 int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
819 {
820 	struct net_device *dev;
821 	int ret;
822 
823 	rtnl_lock();
824 	dev = netkit_dev_fetch(current->nsproxy->net_ns,
825 			       attr->query.target_ifindex,
826 			       attr->query.attach_type);
827 	if (IS_ERR(dev)) {
828 		ret = PTR_ERR(dev);
829 		goto out;
830 	}
831 	ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
832 out:
833 	rtnl_unlock();
834 	return ret;
835 }
836 
837 static struct netkit_link *netkit_link(const struct bpf_link *link)
838 {
839 	return container_of(link, struct netkit_link, link);
840 }
841 
842 static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
843 				   u32 id_or_fd, u64 revision)
844 {
845 	struct netkit_link *nkl = netkit_link(link);
846 	struct bpf_mprog_entry *entry, *entry_new;
847 	struct net_device *dev = nkl->dev;
848 	int ret;
849 
850 	ASSERT_RTNL();
851 	entry = netkit_entry_fetch(dev, true);
852 	ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
853 			       id_or_fd, revision);
854 	if (!ret) {
855 		if (entry != entry_new) {
856 			netkit_entry_update(dev, entry_new);
857 			netkit_entry_sync();
858 		}
859 		bpf_mprog_commit(entry);
860 	}
861 	return ret;
862 }
863 
864 static void netkit_link_release(struct bpf_link *link)
865 {
866 	struct netkit_link *nkl = netkit_link(link);
867 	struct bpf_mprog_entry *entry, *entry_new;
868 	struct net_device *dev;
869 	int ret = 0;
870 
871 	rtnl_lock();
872 	dev = nkl->dev;
873 	if (!dev)
874 		goto out;
875 	entry = netkit_entry_fetch(dev, false);
876 	if (!entry) {
877 		ret = -ENOENT;
878 		goto out;
879 	}
880 	ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
881 	if (!ret) {
882 		if (!bpf_mprog_total(entry_new))
883 			entry_new = NULL;
884 		netkit_entry_update(dev, entry_new);
885 		netkit_entry_sync();
886 		bpf_mprog_commit(entry);
887 		nkl->dev = NULL;
888 	}
889 out:
890 	WARN_ON_ONCE(ret);
891 	rtnl_unlock();
892 }
893 
894 static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
895 			      struct bpf_prog *oprog)
896 {
897 	struct netkit_link *nkl = netkit_link(link);
898 	struct bpf_mprog_entry *entry, *entry_new;
899 	struct net_device *dev;
900 	int ret = 0;
901 
902 	rtnl_lock();
903 	dev = nkl->dev;
904 	if (!dev) {
905 		ret = -ENOLINK;
906 		goto out;
907 	}
908 	if (oprog && link->prog != oprog) {
909 		ret = -EPERM;
910 		goto out;
911 	}
912 	oprog = link->prog;
913 	if (oprog == nprog) {
914 		bpf_prog_put(nprog);
915 		goto out;
916 	}
917 	entry = netkit_entry_fetch(dev, false);
918 	if (!entry) {
919 		ret = -ENOENT;
920 		goto out;
921 	}
922 	ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
923 			       BPF_F_REPLACE | BPF_F_ID,
924 			       link->prog->aux->id, 0);
925 	if (!ret) {
926 		WARN_ON_ONCE(entry != entry_new);
927 		oprog = xchg(&link->prog, nprog);
928 		bpf_prog_put(oprog);
929 		bpf_mprog_commit(entry);
930 	}
931 out:
932 	rtnl_unlock();
933 	return ret;
934 }
935 
/* Free the netkit_link that embeds @link. */
static void netkit_link_dealloc(struct bpf_link *link)
{
	struct netkit_link *nkl = netkit_link(link);

	kfree(nkl);
}
940 
941 static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
942 {
943 	const struct netkit_link *nkl = netkit_link(link);
944 	u32 ifindex = 0;
945 
946 	rtnl_lock();
947 	if (nkl->dev)
948 		ifindex = nkl->dev->ifindex;
949 	rtnl_unlock();
950 
951 	seq_printf(seq, "ifindex:\t%u\n", ifindex);
952 	seq_printf(seq, "attach_type:\t%u (%s)\n",
953 		   link->attach_type,
954 		   link->attach_type == BPF_NETKIT_PRIMARY ? "primary" : "peer");
955 }
956 
957 static int netkit_link_fill_info(const struct bpf_link *link,
958 				 struct bpf_link_info *info)
959 {
960 	const struct netkit_link *nkl = netkit_link(link);
961 	u32 ifindex = 0;
962 
963 	rtnl_lock();
964 	if (nkl->dev)
965 		ifindex = nkl->dev->ifindex;
966 	rtnl_unlock();
967 
968 	info->netkit.ifindex = ifindex;
969 	info->netkit.attach_type = link->attach_type;
970 	return 0;
971 }
972 
/* Detach callback: same work as release, always reports success. */
static int netkit_link_detach(struct bpf_link *link)
{
	netkit_link_release(link);

	return 0;
}
978 
979 static const struct bpf_link_ops netkit_link_lops = {
980 	.release	= netkit_link_release,
981 	.detach		= netkit_link_detach,
982 	.dealloc	= netkit_link_dealloc,
983 	.update_prog	= netkit_link_update,
984 	.show_fdinfo	= netkit_link_fdinfo,
985 	.fill_link_info	= netkit_link_fill_info,
986 };
987 
988 static int netkit_link_init(struct netkit_link *nkl,
989 			    struct bpf_link_primer *link_primer,
990 			    const union bpf_attr *attr,
991 			    struct net_device *dev,
992 			    struct bpf_prog *prog)
993 {
994 	bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
995 		      &netkit_link_lops, prog, attr->link_create.attach_type);
996 	nkl->dev = dev;
997 	return bpf_link_prime(&nkl->link, link_primer);
998 }
999 
1000 int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1001 {
1002 	struct bpf_link_primer link_primer;
1003 	struct netkit_link *nkl;
1004 	struct net_device *dev;
1005 	int ret;
1006 
1007 	rtnl_lock();
1008 	dev = netkit_dev_fetch(current->nsproxy->net_ns,
1009 			       attr->link_create.target_ifindex,
1010 			       attr->link_create.attach_type);
1011 	if (IS_ERR(dev)) {
1012 		ret = PTR_ERR(dev);
1013 		goto out;
1014 	}
1015 	nkl = kzalloc_obj(*nkl, GFP_KERNEL_ACCOUNT);
1016 	if (!nkl) {
1017 		ret = -ENOMEM;
1018 		goto out;
1019 	}
1020 	ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
1021 	if (ret) {
1022 		kfree(nkl);
1023 		goto out;
1024 	}
1025 	ret = netkit_link_prog_attach(&nkl->link,
1026 				      attr->link_create.flags,
1027 				      attr->link_create.netkit.relative_fd,
1028 				      attr->link_create.netkit.expected_revision);
1029 	if (ret) {
1030 		nkl->dev = NULL;
1031 		bpf_link_cleanup(&link_primer);
1032 		goto out;
1033 	}
1034 	ret = bpf_link_settle(&link_primer);
1035 out:
1036 	rtnl_unlock();
1037 	return ret;
1038 }
1039 
1040 static void netkit_release_all(struct net_device *dev)
1041 {
1042 	struct bpf_mprog_entry *entry;
1043 	struct bpf_tuple tuple = {};
1044 	struct bpf_mprog_fp *fp;
1045 	struct bpf_mprog_cp *cp;
1046 
1047 	entry = netkit_entry_fetch(dev, false);
1048 	if (!entry)
1049 		return;
1050 	netkit_entry_update(dev, NULL);
1051 	netkit_entry_sync();
1052 	bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
1053 		if (tuple.link)
1054 			netkit_link(tuple.link)->dev = NULL;
1055 		else
1056 			bpf_prog_put(tuple.prog);
1057 	}
1058 }
1059 
/* ndo_uninit handler: detach all programs, then give back leased queues. */
static void netkit_uninit(struct net_device *dev)
{
	netkit_release_all(dev);

	netkit_queue_unlease(dev);
}
1065 
1066 static void netkit_del_link(struct net_device *dev, struct list_head *head)
1067 {
1068 	struct netkit *nk = netkit_priv(dev);
1069 	struct net_device *peer = rtnl_dereference(nk->peer);
1070 
1071 	RCU_INIT_POINTER(nk->peer, NULL);
1072 	unregister_netdevice_queue(dev, head);
1073 	if (peer) {
1074 		nk = netkit_priv(peer);
1075 		RCU_INIT_POINTER(nk->peer, NULL);
1076 		/* Guard against the peer already being in an unregister
1077 		 * list (e.g. same-namespace teardown where the peer is
1078 		 * in the caller's dev_kill_list). list_move_tail() on an
1079 		 * already-queued device would otherwise corrupt that
1080 		 * list's iteration. This situation can occur via netkit
1081 		 * notifier, hence guard against this scenario.
1082 		 */
1083 		if (!unregister_netdevice_queued(peer))
1084 			unregister_netdevice_queue(peer, head);
1085 	}
1086 }
1087 
1088 static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
1089 			      struct nlattr *data[],
1090 			      struct netlink_ext_ack *extack)
1091 {
1092 	struct netkit *nk = netkit_priv(dev);
1093 	struct net_device *peer = rtnl_dereference(nk->peer);
1094 	enum netkit_action policy;
1095 	struct nlattr *attr;
1096 	int err, i;
1097 	static const struct {
1098 		u32 attr;
1099 		char *name;
1100 	} fixed_params[] = {
1101 		{ IFLA_NETKIT_MODE,       "operating mode" },
1102 		{ IFLA_NETKIT_SCRUB,      "scrubbing" },
1103 		{ IFLA_NETKIT_PEER_SCRUB, "peer scrubbing" },
1104 		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
1105 		{ IFLA_NETKIT_HEADROOM,   "headroom" },
1106 		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
1107 		{ IFLA_NETKIT_PAIRING,    "pairing" },
1108 	};
1109 
1110 	if (!nk->primary) {
1111 		NL_SET_ERR_MSG(extack,
1112 			       "netkit link settings can be changed only through the primary device");
1113 		return -EACCES;
1114 	}
1115 
1116 	for (i = 0; i < ARRAY_SIZE(fixed_params); i++) {
1117 		attr = data[fixed_params[i].attr];
1118 		if (attr) {
1119 			NL_SET_ERR_MSG_ATTR_FMT(extack, attr,
1120 						"netkit link %s cannot be changed after device creation",
1121 						fixed_params[i].name);
1122 			return -EACCES;
1123 		}
1124 	}
1125 
1126 	if (data[IFLA_NETKIT_POLICY]) {
1127 		err = -EOPNOTSUPP;
1128 		attr = data[IFLA_NETKIT_POLICY];
1129 		policy = nla_get_u32(attr);
1130 		if (nk->pair == NETKIT_DEVICE_PAIR)
1131 			err = netkit_check_policy(policy, attr, extack);
1132 		if (err)
1133 			return err;
1134 		WRITE_ONCE(nk->policy, policy);
1135 	}
1136 
1137 	if (data[IFLA_NETKIT_PEER_POLICY]) {
1138 		err = -EOPNOTSUPP;
1139 		attr = data[IFLA_NETKIT_PEER_POLICY];
1140 		policy = nla_get_u32(attr);
1141 		if (peer)
1142 			err = netkit_check_policy(policy, attr, extack);
1143 		if (err)
1144 			return err;
1145 		nk = netkit_priv(peer);
1146 		WRITE_ONCE(nk->policy, policy);
1147 	}
1148 
1149 	return 0;
1150 }
1151 
1152 static void netkit_check_lease_unregister(struct net_device *dev)
1153 {
1154 	LIST_HEAD(list_kill);
1155 	u32 q_idx;
1156 
1157 	if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
1158 	    !dev->dev.parent)
1159 		return;
1160 
1161 	netdev_lock_ops(dev);
1162 	for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
1163 		struct net_device *tmp = dev;
1164 		struct netdev_rx_queue *rxq;
1165 		u32 tmp_q_idx = q_idx;
1166 
1167 		rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx,
1168 						 NETIF_PHYS_TO_VIRT);
1169 		if (rxq && tmp != dev &&
1170 		    tmp->netdev_ops == &netkit_netdev_ops) {
1171 			/* A single phys device can have multiple queues leased
1172 			 * to one netkit device. We can only queue that netkit
1173 			 * device once to the list_kill. Queues of that phys
1174 			 * device can be leased with different individual netkit
1175 			 * devices, hence we batch via list_kill.
1176 			 */
1177 			if (unregister_netdevice_queued(tmp))
1178 				continue;
1179 			netkit_del_link(tmp, &list_kill);
1180 		}
1181 	}
1182 	netdev_unlock_ops(dev);
1183 	unregister_netdevice_many(&list_kill);
1184 }
1185 
1186 static int netkit_notifier(struct notifier_block *this,
1187 			   unsigned long event, void *ptr)
1188 {
1189 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1190 
1191 	if (event == NETDEV_UNREGISTER)
1192 		netkit_check_lease_unregister(dev);
1193 	return NOTIFY_DONE;
1194 }
1195 
1196 static size_t netkit_get_size(const struct net_device *dev)
1197 {
1198 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
1199 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
1200 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */
1201 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */
1202 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
1203 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
1204 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
1205 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
1206 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
1207 	       0;
1208 }
1209 
1210 static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
1211 {
1212 	struct netkit *nk = netkit_priv(dev);
1213 	struct net_device *peer = rtnl_dereference(nk->peer);
1214 
1215 	if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
1216 		return -EMSGSIZE;
1217 	if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
1218 		return -EMSGSIZE;
1219 	if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
1220 		return -EMSGSIZE;
1221 	if (nk->pair == NETKIT_DEVICE_PAIR &&
1222 	    nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub))
1223 		return -EMSGSIZE;
1224 	if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom))
1225 		return -EMSGSIZE;
1226 	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
1227 		return -EMSGSIZE;
1228 	if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
1229 		return -EMSGSIZE;
1230 
1231 	if (peer) {
1232 		nk = netkit_priv(peer);
1233 		if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
1234 			return -EMSGSIZE;
1235 		if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub))
1236 			return -EMSGSIZE;
1237 	}
1238 
1239 	return 0;
1240 }
1241 
1242 static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
1243 	[IFLA_NETKIT_PEER_INFO]		= { .len = sizeof(struct ifinfomsg) },
1244 	[IFLA_NETKIT_MODE]		= NLA_POLICY_MAX(NLA_U32, NETKIT_L3),
1245 	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
1246 	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
1247 	[IFLA_NETKIT_HEADROOM]		= { .type = NLA_U16 },
1248 	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
1249 	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
1250 	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
1251 	[IFLA_NETKIT_PAIRING]		= NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
1252 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
1253 					    .reject_message = "Primary attribute is read-only" },
1254 };
1255 
1256 static struct rtnl_link_ops netkit_link_ops = {
1257 	.kind		= NETKIT_DRV_NAME,
1258 	.priv_size	= sizeof(struct netkit),
1259 	.alloc		= netkit_alloc,
1260 	.setup		= netkit_setup,
1261 	.newlink	= netkit_new_link,
1262 	.dellink	= netkit_del_link,
1263 	.changelink	= netkit_change_link,
1264 	.get_link_net	= netkit_get_link_net,
1265 	.get_size	= netkit_get_size,
1266 	.fill_info	= netkit_fill_info,
1267 	.policy		= netkit_policy,
1268 	.validate	= netkit_validate,
1269 	.peer_type	= IFLA_NETKIT_PEER_INFO,
1270 	.maxtype	= IFLA_NETKIT_MAX,
1271 };
1272 
1273 static struct notifier_block netkit_netdev_notifier = {
1274 	.notifier_call	= netkit_notifier,
1275 };
1276 
1277 static __init int netkit_mod_init(void)
1278 {
1279 	int ret;
1280 
1281 	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
1282 		     (int)NETKIT_PASS != (int)TCX_PASS ||
1283 		     (int)NETKIT_DROP != (int)TCX_DROP ||
1284 		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
1285 
1286 	ret = rtnl_link_register(&netkit_link_ops);
1287 	if (ret)
1288 		return ret;
1289 	ret = register_netdevice_notifier(&netkit_netdev_notifier);
1290 	if (ret)
1291 		rtnl_link_unregister(&netkit_link_ops);
1292 	return ret;
1293 }
1294 
1295 static __exit void netkit_mod_exit(void)
1296 {
1297 	unregister_netdevice_notifier(&netkit_netdev_notifier);
1298 	rtnl_link_unregister(&netkit_link_ops);
1299 }
1300 
1301 module_init(netkit_mod_init);
1302 module_exit(netkit_mod_exit);
1303 
1304 MODULE_DESCRIPTION("BPF-programmable network device");
1305 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
1306 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
1307 MODULE_LICENSE("GPL");
1308 MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
1309