xref: /linux/drivers/net/netkit.c (revision 335bbdf01d25517ae832ac1807fd8323c1f4f3b9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2023 Isovalent */
3 
4 #include <linux/netdevice.h>
5 #include <linux/ethtool.h>
6 #include <linux/etherdevice.h>
7 #include <linux/filter.h>
8 #include <linux/netfilter_netdev.h>
9 #include <linux/bpf_mprog.h>
10 
11 #include <net/netkit.h>
12 #include <net/dst.h>
13 #include <net/tcx.h>
14 
/* Driver name: reported through ethtool drvinfo and used as the rtnl link kind. */
#define DRV_NAME "netkit"
16 
17 struct netkit {
18 	/* Needed in fast-path */
19 	struct net_device __rcu *peer;
20 	struct bpf_mprog_entry __rcu *active;
21 	enum netkit_action policy;
22 	struct bpf_mprog_bundle	bundle;
23 
24 	/* Needed in slow-path */
25 	enum netkit_mode mode;
26 	bool primary;
27 	u32 headroom;
28 };
29 
30 struct netkit_link {
31 	struct bpf_link link;
32 	struct net_device *dev;
33 	u32 location;
34 };
35 
36 static __always_inline int
37 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
38 	   enum netkit_action ret)
39 {
40 	const struct bpf_mprog_fp *fp;
41 	const struct bpf_prog *prog;
42 
43 	bpf_mprog_foreach_prog(entry, fp, prog) {
44 		bpf_compute_data_pointers(skb);
45 		ret = bpf_prog_run(prog, skb);
46 		if (ret != NETKIT_NEXT)
47 			break;
48 	}
49 	return ret;
50 }
51 
52 static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
53 {
54 	skb_scrub_packet(skb, xnet);
55 	skb->priority = 0;
56 	nf_skip_egress(skb, true);
57 }
58 
/* Typed accessor for the netkit private area of @dev. */
static struct netkit *netkit_priv(const struct net_device *dev)
{
	struct netkit *nk = netdev_priv(dev);

	return nk;
}
63 
64 static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
65 {
66 	struct netkit *nk = netkit_priv(dev);
67 	enum netkit_action ret = READ_ONCE(nk->policy);
68 	netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
69 	const struct bpf_mprog_entry *entry;
70 	struct net_device *peer;
71 
72 	rcu_read_lock();
73 	peer = rcu_dereference(nk->peer);
74 	if (unlikely(!peer || !(peer->flags & IFF_UP) ||
75 		     !pskb_may_pull(skb, ETH_HLEN) ||
76 		     skb_orphan_frags(skb, GFP_ATOMIC)))
77 		goto drop;
78 	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
79 	skb->dev = peer;
80 	entry = rcu_dereference(nk->active);
81 	if (entry)
82 		ret = netkit_run(entry, skb, ret);
83 	switch (ret) {
84 	case NETKIT_NEXT:
85 	case NETKIT_PASS:
86 		skb->protocol = eth_type_trans(skb, skb->dev);
87 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
88 		__netif_rx(skb);
89 		break;
90 	case NETKIT_REDIRECT:
91 		skb_do_redirect(skb);
92 		break;
93 	case NETKIT_DROP:
94 	default:
95 drop:
96 		kfree_skb(skb);
97 		dev_core_stats_tx_dropped_inc(dev);
98 		ret_dev = NET_XMIT_DROP;
99 		break;
100 	}
101 	rcu_read_unlock();
102 	return ret_dev;
103 }
104 
105 static int netkit_open(struct net_device *dev)
106 {
107 	struct netkit *nk = netkit_priv(dev);
108 	struct net_device *peer = rtnl_dereference(nk->peer);
109 
110 	if (!peer)
111 		return -ENOTCONN;
112 	if (peer->flags & IFF_UP) {
113 		netif_carrier_on(dev);
114 		netif_carrier_on(peer);
115 	}
116 	return 0;
117 }
118 
119 static int netkit_close(struct net_device *dev)
120 {
121 	struct netkit *nk = netkit_priv(dev);
122 	struct net_device *peer = rtnl_dereference(nk->peer);
123 
124 	netif_carrier_off(dev);
125 	if (peer)
126 		netif_carrier_off(peer);
127 	return 0;
128 }
129 
130 static int netkit_get_iflink(const struct net_device *dev)
131 {
132 	struct netkit *nk = netkit_priv(dev);
133 	struct net_device *peer;
134 	int iflink = 0;
135 
136 	rcu_read_lock();
137 	peer = rcu_dereference(nk->peer);
138 	if (peer)
139 		iflink = peer->ifindex;
140 	rcu_read_unlock();
141 	return iflink;
142 }
143 
/* ndo_set_rx_mode: intentionally empty. */
static void netkit_set_multicast(struct net_device *dev)
{
	/* Nothing to do, we receive whatever gets pushed to us! */
}
148 
149 static void netkit_set_headroom(struct net_device *dev, int headroom)
150 {
151 	struct netkit *nk = netkit_priv(dev), *nk2;
152 	struct net_device *peer;
153 
154 	if (headroom < 0)
155 		headroom = NET_SKB_PAD;
156 
157 	rcu_read_lock();
158 	peer = rcu_dereference(nk->peer);
159 	if (unlikely(!peer))
160 		goto out;
161 
162 	nk2 = netkit_priv(peer);
163 	nk->headroom = headroom;
164 	headroom = max(nk->headroom, nk2->headroom);
165 
166 	peer->needed_headroom = headroom;
167 	dev->needed_headroom = headroom;
168 out:
169 	rcu_read_unlock();
170 }
171 
172 static struct net_device *netkit_peer_dev(struct net_device *dev)
173 {
174 	return rcu_dereference(netkit_priv(dev)->peer);
175 }
176 
177 static void netkit_uninit(struct net_device *dev);
178 
179 static const struct net_device_ops netkit_netdev_ops = {
180 	.ndo_open		= netkit_open,
181 	.ndo_stop		= netkit_close,
182 	.ndo_start_xmit		= netkit_xmit,
183 	.ndo_set_rx_mode	= netkit_set_multicast,
184 	.ndo_set_rx_headroom	= netkit_set_headroom,
185 	.ndo_get_iflink		= netkit_get_iflink,
186 	.ndo_get_peer_dev	= netkit_peer_dev,
187 	.ndo_uninit		= netkit_uninit,
188 	.ndo_features_check	= passthru_features_check,
189 };
190 
191 static void netkit_get_drvinfo(struct net_device *dev,
192 			       struct ethtool_drvinfo *info)
193 {
194 	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
195 }
196 
197 static const struct ethtool_ops netkit_ethtool_ops = {
198 	.get_drvinfo		= netkit_get_drvinfo,
199 };
200 
201 static void netkit_setup(struct net_device *dev)
202 {
203 	static const netdev_features_t netkit_features_hw_vlan =
204 		NETIF_F_HW_VLAN_CTAG_TX |
205 		NETIF_F_HW_VLAN_CTAG_RX |
206 		NETIF_F_HW_VLAN_STAG_TX |
207 		NETIF_F_HW_VLAN_STAG_RX;
208 	static const netdev_features_t netkit_features =
209 		netkit_features_hw_vlan |
210 		NETIF_F_SG |
211 		NETIF_F_FRAGLIST |
212 		NETIF_F_HW_CSUM |
213 		NETIF_F_RXCSUM |
214 		NETIF_F_SCTP_CRC |
215 		NETIF_F_HIGHDMA |
216 		NETIF_F_GSO_SOFTWARE |
217 		NETIF_F_GSO_ENCAP_ALL;
218 
219 	ether_setup(dev);
220 	dev->max_mtu = ETH_MAX_MTU;
221 
222 	dev->flags |= IFF_NOARP;
223 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
224 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
225 	dev->priv_flags |= IFF_PHONY_HEADROOM;
226 	dev->priv_flags |= IFF_NO_QUEUE;
227 
228 	dev->ethtool_ops = &netkit_ethtool_ops;
229 	dev->netdev_ops  = &netkit_netdev_ops;
230 
231 	dev->features |= netkit_features | NETIF_F_LLTX;
232 	dev->hw_features = netkit_features;
233 	dev->hw_enc_features = netkit_features;
234 	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
235 	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
236 
237 	dev->needs_free_netdev = true;
238 
239 	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
240 }
241 
242 static struct net *netkit_get_link_net(const struct net_device *dev)
243 {
244 	struct netkit *nk = netkit_priv(dev);
245 	struct net_device *peer = rtnl_dereference(nk->peer);
246 
247 	return peer ? dev_net(peer) : dev_net(dev);
248 }
249 
250 static int netkit_check_policy(int policy, struct nlattr *tb,
251 			       struct netlink_ext_ack *extack)
252 {
253 	switch (policy) {
254 	case NETKIT_PASS:
255 	case NETKIT_DROP:
256 		return 0;
257 	default:
258 		NL_SET_ERR_MSG_ATTR(extack, tb,
259 				    "Provided default xmit policy not supported");
260 		return -EINVAL;
261 	}
262 }
263 
264 static int netkit_check_mode(int mode, struct nlattr *tb,
265 			     struct netlink_ext_ack *extack)
266 {
267 	switch (mode) {
268 	case NETKIT_L2:
269 	case NETKIT_L3:
270 		return 0;
271 	default:
272 		NL_SET_ERR_MSG_ATTR(extack, tb,
273 				    "Provided device mode can only be L2 or L3");
274 		return -EINVAL;
275 	}
276 }
277 
278 static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
279 			   struct netlink_ext_ack *extack)
280 {
281 	struct nlattr *attr = tb[IFLA_ADDRESS];
282 
283 	if (!attr)
284 		return 0;
285 	NL_SET_ERR_MSG_ATTR(extack, attr,
286 			    "Setting Ethernet address is not supported");
287 	return -EOPNOTSUPP;
288 }
289 
290 static struct rtnl_link_ops netkit_link_ops;
291 
292 static int netkit_new_link(struct net *src_net, struct net_device *dev,
293 			   struct nlattr *tb[], struct nlattr *data[],
294 			   struct netlink_ext_ack *extack)
295 {
296 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
297 	enum netkit_action default_prim = NETKIT_PASS;
298 	enum netkit_action default_peer = NETKIT_PASS;
299 	enum netkit_mode mode = NETKIT_L3;
300 	unsigned char ifname_assign_type;
301 	struct ifinfomsg *ifmp = NULL;
302 	struct net_device *peer;
303 	char ifname[IFNAMSIZ];
304 	struct netkit *nk;
305 	struct net *net;
306 	int err;
307 
308 	if (data) {
309 		if (data[IFLA_NETKIT_MODE]) {
310 			attr = data[IFLA_NETKIT_MODE];
311 			mode = nla_get_u32(attr);
312 			err = netkit_check_mode(mode, attr, extack);
313 			if (err < 0)
314 				return err;
315 		}
316 		if (data[IFLA_NETKIT_PEER_INFO]) {
317 			attr = data[IFLA_NETKIT_PEER_INFO];
318 			ifmp = nla_data(attr);
319 			err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
320 			if (err < 0)
321 				return err;
322 			err = netkit_validate(peer_tb, NULL, extack);
323 			if (err < 0)
324 				return err;
325 			tbp = peer_tb;
326 		}
327 		if (data[IFLA_NETKIT_POLICY]) {
328 			attr = data[IFLA_NETKIT_POLICY];
329 			default_prim = nla_get_u32(attr);
330 			err = netkit_check_policy(default_prim, attr, extack);
331 			if (err < 0)
332 				return err;
333 		}
334 		if (data[IFLA_NETKIT_PEER_POLICY]) {
335 			attr = data[IFLA_NETKIT_PEER_POLICY];
336 			default_peer = nla_get_u32(attr);
337 			err = netkit_check_policy(default_peer, attr, extack);
338 			if (err < 0)
339 				return err;
340 		}
341 	}
342 
343 	if (ifmp && tbp[IFLA_IFNAME]) {
344 		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
345 		ifname_assign_type = NET_NAME_USER;
346 	} else {
347 		strscpy(ifname, "nk%d", IFNAMSIZ);
348 		ifname_assign_type = NET_NAME_ENUM;
349 	}
350 
351 	net = rtnl_link_get_net(src_net, tbp);
352 	if (IS_ERR(net))
353 		return PTR_ERR(net);
354 
355 	peer = rtnl_create_link(net, ifname, ifname_assign_type,
356 				&netkit_link_ops, tbp, extack);
357 	if (IS_ERR(peer)) {
358 		put_net(net);
359 		return PTR_ERR(peer);
360 	}
361 
362 	netif_inherit_tso_max(peer, dev);
363 
364 	if (mode == NETKIT_L2)
365 		eth_hw_addr_random(peer);
366 	if (ifmp && dev->ifindex)
367 		peer->ifindex = ifmp->ifi_index;
368 
369 	nk = netkit_priv(peer);
370 	nk->primary = false;
371 	nk->policy = default_peer;
372 	nk->mode = mode;
373 	bpf_mprog_bundle_init(&nk->bundle);
374 
375 	err = register_netdevice(peer);
376 	put_net(net);
377 	if (err < 0)
378 		goto err_register_peer;
379 	netif_carrier_off(peer);
380 	if (mode == NETKIT_L2)
381 		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
382 
383 	err = rtnl_configure_link(peer, NULL, 0, NULL);
384 	if (err < 0)
385 		goto err_configure_peer;
386 
387 	if (mode == NETKIT_L2)
388 		eth_hw_addr_random(dev);
389 	if (tb[IFLA_IFNAME])
390 		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
391 	else
392 		strscpy(dev->name, "nk%d", IFNAMSIZ);
393 
394 	nk = netkit_priv(dev);
395 	nk->primary = true;
396 	nk->policy = default_prim;
397 	nk->mode = mode;
398 	bpf_mprog_bundle_init(&nk->bundle);
399 
400 	err = register_netdevice(dev);
401 	if (err < 0)
402 		goto err_configure_peer;
403 	netif_carrier_off(dev);
404 	if (mode == NETKIT_L2)
405 		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
406 
407 	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
408 	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
409 	return 0;
410 err_configure_peer:
411 	unregister_netdevice(peer);
412 	return err;
413 err_register_peer:
414 	free_netdev(peer);
415 	return err;
416 }
417 
418 static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
419 						  bool bundle_fallback)
420 {
421 	struct netkit *nk = netkit_priv(dev);
422 	struct bpf_mprog_entry *entry;
423 
424 	ASSERT_RTNL();
425 	entry = rcu_dereference_rtnl(nk->active);
426 	if (entry)
427 		return entry;
428 	if (bundle_fallback)
429 		return &nk->bundle.a;
430 	return NULL;
431 }
432 
433 static void netkit_entry_update(struct net_device *dev,
434 				struct bpf_mprog_entry *entry)
435 {
436 	struct netkit *nk = netkit_priv(dev);
437 
438 	ASSERT_RTNL();
439 	rcu_assign_pointer(nk->active, entry);
440 }
441 
/* Wait for an RCU grace period; callers use it after switching the entry. */
static void netkit_entry_sync(void)
{
	synchronize_rcu();
}
446 
447 static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
448 {
449 	struct net_device *dev;
450 	struct netkit *nk;
451 
452 	ASSERT_RTNL();
453 
454 	switch (which) {
455 	case BPF_NETKIT_PRIMARY:
456 	case BPF_NETKIT_PEER:
457 		break;
458 	default:
459 		return ERR_PTR(-EINVAL);
460 	}
461 
462 	dev = __dev_get_by_index(net, ifindex);
463 	if (!dev)
464 		return ERR_PTR(-ENODEV);
465 	if (dev->netdev_ops != &netkit_netdev_ops)
466 		return ERR_PTR(-ENXIO);
467 
468 	nk = netkit_priv(dev);
469 	if (!nk->primary)
470 		return ERR_PTR(-EACCES);
471 	if (which == BPF_NETKIT_PEER) {
472 		dev = rcu_dereference_rtnl(nk->peer);
473 		if (!dev)
474 			return ERR_PTR(-ENODEV);
475 	}
476 	return dev;
477 }
478 
479 int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
480 {
481 	struct bpf_mprog_entry *entry, *entry_new;
482 	struct bpf_prog *replace_prog = NULL;
483 	struct net_device *dev;
484 	int ret;
485 
486 	rtnl_lock();
487 	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
488 			       attr->attach_type);
489 	if (IS_ERR(dev)) {
490 		ret = PTR_ERR(dev);
491 		goto out;
492 	}
493 	entry = netkit_entry_fetch(dev, true);
494 	if (attr->attach_flags & BPF_F_REPLACE) {
495 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
496 						 prog->type);
497 		if (IS_ERR(replace_prog)) {
498 			ret = PTR_ERR(replace_prog);
499 			replace_prog = NULL;
500 			goto out;
501 		}
502 	}
503 	ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
504 			       attr->attach_flags, attr->relative_fd,
505 			       attr->expected_revision);
506 	if (!ret) {
507 		if (entry != entry_new) {
508 			netkit_entry_update(dev, entry_new);
509 			netkit_entry_sync();
510 		}
511 		bpf_mprog_commit(entry);
512 	}
513 out:
514 	if (replace_prog)
515 		bpf_prog_put(replace_prog);
516 	rtnl_unlock();
517 	return ret;
518 }
519 
520 int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
521 {
522 	struct bpf_mprog_entry *entry, *entry_new;
523 	struct net_device *dev;
524 	int ret;
525 
526 	rtnl_lock();
527 	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
528 			       attr->attach_type);
529 	if (IS_ERR(dev)) {
530 		ret = PTR_ERR(dev);
531 		goto out;
532 	}
533 	entry = netkit_entry_fetch(dev, false);
534 	if (!entry) {
535 		ret = -ENOENT;
536 		goto out;
537 	}
538 	ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
539 			       attr->relative_fd, attr->expected_revision);
540 	if (!ret) {
541 		if (!bpf_mprog_total(entry_new))
542 			entry_new = NULL;
543 		netkit_entry_update(dev, entry_new);
544 		netkit_entry_sync();
545 		bpf_mprog_commit(entry);
546 	}
547 out:
548 	rtnl_unlock();
549 	return ret;
550 }
551 
552 int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
553 {
554 	struct net_device *dev;
555 	int ret;
556 
557 	rtnl_lock();
558 	dev = netkit_dev_fetch(current->nsproxy->net_ns,
559 			       attr->query.target_ifindex,
560 			       attr->query.attach_type);
561 	if (IS_ERR(dev)) {
562 		ret = PTR_ERR(dev);
563 		goto out;
564 	}
565 	ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
566 out:
567 	rtnl_unlock();
568 	return ret;
569 }
570 
571 static struct netkit_link *netkit_link(const struct bpf_link *link)
572 {
573 	return container_of(link, struct netkit_link, link);
574 }
575 
576 static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
577 				   u32 id_or_fd, u64 revision)
578 {
579 	struct netkit_link *nkl = netkit_link(link);
580 	struct bpf_mprog_entry *entry, *entry_new;
581 	struct net_device *dev = nkl->dev;
582 	int ret;
583 
584 	ASSERT_RTNL();
585 	entry = netkit_entry_fetch(dev, true);
586 	ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
587 			       id_or_fd, revision);
588 	if (!ret) {
589 		if (entry != entry_new) {
590 			netkit_entry_update(dev, entry_new);
591 			netkit_entry_sync();
592 		}
593 		bpf_mprog_commit(entry);
594 	}
595 	return ret;
596 }
597 
598 static void netkit_link_release(struct bpf_link *link)
599 {
600 	struct netkit_link *nkl = netkit_link(link);
601 	struct bpf_mprog_entry *entry, *entry_new;
602 	struct net_device *dev;
603 	int ret = 0;
604 
605 	rtnl_lock();
606 	dev = nkl->dev;
607 	if (!dev)
608 		goto out;
609 	entry = netkit_entry_fetch(dev, false);
610 	if (!entry) {
611 		ret = -ENOENT;
612 		goto out;
613 	}
614 	ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
615 	if (!ret) {
616 		if (!bpf_mprog_total(entry_new))
617 			entry_new = NULL;
618 		netkit_entry_update(dev, entry_new);
619 		netkit_entry_sync();
620 		bpf_mprog_commit(entry);
621 		nkl->dev = NULL;
622 	}
623 out:
624 	WARN_ON_ONCE(ret);
625 	rtnl_unlock();
626 }
627 
628 static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
629 			      struct bpf_prog *oprog)
630 {
631 	struct netkit_link *nkl = netkit_link(link);
632 	struct bpf_mprog_entry *entry, *entry_new;
633 	struct net_device *dev;
634 	int ret = 0;
635 
636 	rtnl_lock();
637 	dev = nkl->dev;
638 	if (!dev) {
639 		ret = -ENOLINK;
640 		goto out;
641 	}
642 	if (oprog && link->prog != oprog) {
643 		ret = -EPERM;
644 		goto out;
645 	}
646 	oprog = link->prog;
647 	if (oprog == nprog) {
648 		bpf_prog_put(nprog);
649 		goto out;
650 	}
651 	entry = netkit_entry_fetch(dev, false);
652 	if (!entry) {
653 		ret = -ENOENT;
654 		goto out;
655 	}
656 	ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
657 			       BPF_F_REPLACE | BPF_F_ID,
658 			       link->prog->aux->id, 0);
659 	if (!ret) {
660 		WARN_ON_ONCE(entry != entry_new);
661 		oprog = xchg(&link->prog, nprog);
662 		bpf_prog_put(oprog);
663 		bpf_mprog_commit(entry);
664 	}
665 out:
666 	rtnl_unlock();
667 	return ret;
668 }
669 
/* bpf_link_ops dealloc: free the netkit_link that embeds @link. */
static void netkit_link_dealloc(struct bpf_link *link)
{
	struct netkit_link *nkl = netkit_link(link);

	kfree(nkl);
}
674 
675 static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
676 {
677 	const struct netkit_link *nkl = netkit_link(link);
678 	u32 ifindex = 0;
679 
680 	rtnl_lock();
681 	if (nkl->dev)
682 		ifindex = nkl->dev->ifindex;
683 	rtnl_unlock();
684 
685 	seq_printf(seq, "ifindex:\t%u\n", ifindex);
686 	seq_printf(seq, "attach_type:\t%u (%s)\n",
687 		   nkl->location,
688 		   nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer");
689 }
690 
691 static int netkit_link_fill_info(const struct bpf_link *link,
692 				 struct bpf_link_info *info)
693 {
694 	const struct netkit_link *nkl = netkit_link(link);
695 	u32 ifindex = 0;
696 
697 	rtnl_lock();
698 	if (nkl->dev)
699 		ifindex = nkl->dev->ifindex;
700 	rtnl_unlock();
701 
702 	info->netkit.ifindex = ifindex;
703 	info->netkit.attach_type = nkl->location;
704 	return 0;
705 }
706 
/* bpf_link_ops detach: same work as release; always reports success. */
static int netkit_link_detach(struct bpf_link *link)
{
	netkit_link_release(link);

	return 0;
}
712 
713 static const struct bpf_link_ops netkit_link_lops = {
714 	.release	= netkit_link_release,
715 	.detach		= netkit_link_detach,
716 	.dealloc	= netkit_link_dealloc,
717 	.update_prog	= netkit_link_update,
718 	.show_fdinfo	= netkit_link_fdinfo,
719 	.fill_link_info	= netkit_link_fill_info,
720 };
721 
722 static int netkit_link_init(struct netkit_link *nkl,
723 			    struct bpf_link_primer *link_primer,
724 			    const union bpf_attr *attr,
725 			    struct net_device *dev,
726 			    struct bpf_prog *prog)
727 {
728 	bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
729 		      &netkit_link_lops, prog);
730 	nkl->location = attr->link_create.attach_type;
731 	nkl->dev = dev;
732 	return bpf_link_prime(&nkl->link, link_primer);
733 }
734 
735 int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
736 {
737 	struct bpf_link_primer link_primer;
738 	struct netkit_link *nkl;
739 	struct net_device *dev;
740 	int ret;
741 
742 	rtnl_lock();
743 	dev = netkit_dev_fetch(current->nsproxy->net_ns,
744 			       attr->link_create.target_ifindex,
745 			       attr->link_create.attach_type);
746 	if (IS_ERR(dev)) {
747 		ret = PTR_ERR(dev);
748 		goto out;
749 	}
750 	nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT);
751 	if (!nkl) {
752 		ret = -ENOMEM;
753 		goto out;
754 	}
755 	ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
756 	if (ret) {
757 		kfree(nkl);
758 		goto out;
759 	}
760 	ret = netkit_link_prog_attach(&nkl->link,
761 				      attr->link_create.flags,
762 				      attr->link_create.netkit.relative_fd,
763 				      attr->link_create.netkit.expected_revision);
764 	if (ret) {
765 		nkl->dev = NULL;
766 		bpf_link_cleanup(&link_primer);
767 		goto out;
768 	}
769 	ret = bpf_link_settle(&link_primer);
770 out:
771 	rtnl_unlock();
772 	return ret;
773 }
774 
775 static void netkit_release_all(struct net_device *dev)
776 {
777 	struct bpf_mprog_entry *entry;
778 	struct bpf_tuple tuple = {};
779 	struct bpf_mprog_fp *fp;
780 	struct bpf_mprog_cp *cp;
781 
782 	entry = netkit_entry_fetch(dev, false);
783 	if (!entry)
784 		return;
785 	netkit_entry_update(dev, NULL);
786 	netkit_entry_sync();
787 	bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
788 		if (tuple.link)
789 			netkit_link(tuple.link)->dev = NULL;
790 		else
791 			bpf_prog_put(tuple.prog);
792 	}
793 }
794 
/* ndo_uninit: release all BPF programs still attached to @dev. */
static void netkit_uninit(struct net_device *dev)
{
	netkit_release_all(dev);
}
799 
800 static void netkit_del_link(struct net_device *dev, struct list_head *head)
801 {
802 	struct netkit *nk = netkit_priv(dev);
803 	struct net_device *peer = rtnl_dereference(nk->peer);
804 
805 	RCU_INIT_POINTER(nk->peer, NULL);
806 	unregister_netdevice_queue(dev, head);
807 	if (peer) {
808 		nk = netkit_priv(peer);
809 		RCU_INIT_POINTER(nk->peer, NULL);
810 		unregister_netdevice_queue(peer, head);
811 	}
812 }
813 
814 static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
815 			      struct nlattr *data[],
816 			      struct netlink_ext_ack *extack)
817 {
818 	struct netkit *nk = netkit_priv(dev);
819 	struct net_device *peer = rtnl_dereference(nk->peer);
820 	enum netkit_action policy;
821 	struct nlattr *attr;
822 	int err;
823 
824 	if (!nk->primary) {
825 		NL_SET_ERR_MSG(extack,
826 			       "netkit link settings can be changed only through the primary device");
827 		return -EACCES;
828 	}
829 
830 	if (data[IFLA_NETKIT_MODE]) {
831 		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
832 				    "netkit link operating mode cannot be changed after device creation");
833 		return -EACCES;
834 	}
835 
836 	if (data[IFLA_NETKIT_POLICY]) {
837 		attr = data[IFLA_NETKIT_POLICY];
838 		policy = nla_get_u32(attr);
839 		err = netkit_check_policy(policy, attr, extack);
840 		if (err)
841 			return err;
842 		WRITE_ONCE(nk->policy, policy);
843 	}
844 
845 	if (data[IFLA_NETKIT_PEER_POLICY]) {
846 		err = -EOPNOTSUPP;
847 		attr = data[IFLA_NETKIT_PEER_POLICY];
848 		policy = nla_get_u32(attr);
849 		if (peer)
850 			err = netkit_check_policy(policy, attr, extack);
851 		if (err)
852 			return err;
853 		nk = netkit_priv(peer);
854 		WRITE_ONCE(nk->policy, policy);
855 	}
856 
857 	return 0;
858 }
859 
860 static size_t netkit_get_size(const struct net_device *dev)
861 {
862 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
863 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
864 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
865 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
866 	       0;
867 }
868 
869 static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
870 {
871 	struct netkit *nk = netkit_priv(dev);
872 	struct net_device *peer = rtnl_dereference(nk->peer);
873 
874 	if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
875 		return -EMSGSIZE;
876 	if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
877 		return -EMSGSIZE;
878 	if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
879 		return -EMSGSIZE;
880 
881 	if (peer) {
882 		nk = netkit_priv(peer);
883 		if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
884 			return -EMSGSIZE;
885 	}
886 
887 	return 0;
888 }
889 
890 static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
891 	[IFLA_NETKIT_PEER_INFO]		= { .len = sizeof(struct ifinfomsg) },
892 	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
893 	[IFLA_NETKIT_MODE]		= { .type = NLA_U32 },
894 	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
895 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
896 					    .reject_message = "Primary attribute is read-only" },
897 };
898 
899 static struct rtnl_link_ops netkit_link_ops = {
900 	.kind		= DRV_NAME,
901 	.priv_size	= sizeof(struct netkit),
902 	.setup		= netkit_setup,
903 	.newlink	= netkit_new_link,
904 	.dellink	= netkit_del_link,
905 	.changelink	= netkit_change_link,
906 	.get_link_net	= netkit_get_link_net,
907 	.get_size	= netkit_get_size,
908 	.fill_info	= netkit_fill_info,
909 	.policy		= netkit_policy,
910 	.validate	= netkit_validate,
911 	.maxtype	= IFLA_NETKIT_MAX,
912 };
913 
914 static __init int netkit_init(void)
915 {
916 	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
917 		     (int)NETKIT_PASS != (int)TCX_PASS ||
918 		     (int)NETKIT_DROP != (int)TCX_DROP ||
919 		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
920 
921 	return rtnl_link_register(&netkit_link_ops);
922 }
923 
924 static __exit void netkit_exit(void)
925 {
926 	rtnl_link_unregister(&netkit_link_ops);
927 }
928 
929 module_init(netkit_init);
930 module_exit(netkit_exit);
931 
932 MODULE_DESCRIPTION("BPF-programmable network device");
933 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
934 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
935 MODULE_LICENSE("GPL");
936 MODULE_ALIAS_RTNL_LINK(DRV_NAME);
937