xref: /linux/tools/testing/selftests/bpf/prog_tests/tc_redirect.c (revision 90b83efa6701656e02c86e7df2cb1765ea602d07)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /*
4  * This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
5  * between src and dst. The netns fwd has veth links to each src and dst. The
6  * client is in src and server in dst. The test installs a TC BPF program to each
7  * host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
8  * neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
9  * switch from ingress side; it also installs a checker prog on the egress side
10  * to drop unexpected traffic.
11  */
12 
13 #include <arpa/inet.h>
14 #include <linux/if_tun.h>
15 #include <linux/limits.h>
16 #include <linux/sysctl.h>
17 #include <linux/time_types.h>
18 #include <linux/net_tstamp.h>
19 #include <net/if.h>
20 #include <stdbool.h>
21 #include <stdio.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include "test_progs.h"
26 #include "network_helpers.h"
27 #include "netlink_helpers.h"
28 #include "test_tc_neigh_fib.skel.h"
29 #include "test_tc_neigh.skel.h"
30 #include "test_tc_peer.skel.h"
31 #include "test_tc_dtime.skel.h"
32 
33 #ifndef TCP_TX_DELAY
34 #define TCP_TX_DELAY 37
35 #endif
36 
37 #define NS_SRC "ns_src"
38 #define NS_FWD "ns_fwd"
39 #define NS_DST "ns_dst"
40 
41 #define IP4_SRC "172.16.1.100"
42 #define IP4_DST "172.16.2.100"
43 #define IP4_TUN_SRC "172.17.1.100"
44 #define IP4_TUN_FWD "172.17.1.200"
45 #define IP4_PORT 9004
46 
47 #define IP6_SRC "0::1:dead:beef:cafe"
48 #define IP6_DST "0::2:dead:beef:cafe"
49 #define IP6_TUN_SRC "1::1:dead:beef:cafe"
50 #define IP6_TUN_FWD "1::2:dead:beef:cafe"
51 #define IP6_PORT 9006
52 
53 #define IP4_SLL "169.254.0.1"
54 #define IP4_DLL "169.254.0.2"
55 #define IP4_NET "169.254.0.0"
56 
57 #define MAC_DST_FWD "00:11:22:33:44:55"
58 #define MAC_DST "00:22:33:44:55:66"
59 #define MAC_SRC_FWD "00:33:44:55:66:77"
60 #define MAC_SRC "00:44:55:66:77:88"
61 
62 #define IFADDR_STR_LEN 18
63 #define PING_ARGS "-i 0.2 -c 3 -w 10 -q"
64 
65 #define TIMEOUT_MILLIS 10000
66 #define NSEC_PER_SEC 1000000000ULL
67 
/*
 * Print a formatted diagnostic on stderr, prefixed with the source file, the
 * line number and the strerror() text of the current errno. MSG has to be a
 * string literal because it is pasted between the prefix and the newline.
 */
#define log_err(MSG, ...) \
	fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
		__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)
71 
/* NULL-terminated list of the network namespace names used by the tests. */
static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
/* Namespace handles; slot i belongs to namespaces[i]. */
static struct netns_obj *netns_objs[3];
74 
/*
 * Overwrite the start of an existing file with the string @newval.
 *
 * The file is opened with "r+", so it must already exist.
 *
 * Returns 0 on success, -1 if the file cannot be opened or the data cannot be
 * written out.
 */
static int write_file(const char *path, const char *newval)
{
	FILE *f;

	f = fopen(path, "r+");
	if (!f)
		return -1;
	if (fwrite(newval, strlen(newval), 1, f) != 1) {
		log_err("writing to %s failed", path);
		fclose(f);
		return -1;
	}
	/*
	 * stdio buffers the data, so the underlying write(2) may only be
	 * issued, and may only fail, when fclose() flushes the stream.
	 */
	if (fclose(f) != 0) {
		log_err("writing to %s failed", path);
		return -1;
	}
	return 0;
}
90 
91 static int netns_setup_namespaces(const char *verb)
92 {
93 	struct netns_obj **ns_obj = netns_objs;
94 	const char * const *ns = namespaces;
95 
96 	while (*ns) {
97 		if (strcmp(verb, "add") == 0) {
98 			*ns_obj = netns_new(*ns, false);
99 			if (!ASSERT_OK_PTR(*ns_obj, "netns_new"))
100 				return -1;
101 		} else {
102 			if (!ASSERT_OK_PTR(*ns_obj, "netns_obj is NULL"))
103 				return -1;
104 			netns_free(*ns_obj);
105 			*ns_obj = NULL;
106 		}
107 		ns++;
108 		ns_obj++;
109 	}
110 	return 0;
111 }
112 
113 static void netns_setup_namespaces_nofail(const char *verb)
114 {
115 	struct netns_obj **ns_obj = netns_objs;
116 	const char * const *ns = namespaces;
117 
118 	while (*ns) {
119 		if (strcmp(verb, "add") == 0) {
120 			*ns_obj = netns_new(*ns, false);
121 		} else {
122 			if (*ns_obj)
123 				netns_free(*ns_obj);
124 			*ns_obj = NULL;
125 		}
126 		ns++;
127 		ns_obj++;
128 	}
129 }
130 
/* Kind of device pair that links the namespaces together. */
enum dev_mode {
	MODE_VETH = 0,		/* links are created with "ip link ... type veth" */
	MODE_NETKIT = 1,	/* links are created through create_netkit() */
};
135 
/*
 * Parameters and results of the namespace/link setup: dev_mode is read by
 * netns_setup_links_and_routes(), the ifindex_* members are filled in by it.
 */
struct netns_setup_result {
	enum dev_mode dev_mode;	/* selects the veth or netkit code paths */
	int ifindex_src;	/* ifindex of the "src" device */
	int ifindex_src_fwd;	/* ifindex of the "src_fwd" device */
	int ifindex_dst;	/* ifindex of the "dst" device */
	int ifindex_dst_fwd;	/* ifindex of the "dst_fwd" device */
};
143 
144 static int get_ifaddr(const char *name, char *ifaddr)
145 {
146 	char path[PATH_MAX];
147 	FILE *f;
148 	int ret;
149 
150 	snprintf(path, PATH_MAX, "/sys/class/net/%s/address", name);
151 	f = fopen(path, "r");
152 	if (!ASSERT_OK_PTR(f, path))
153 		return -1;
154 
155 	ret = fread(ifaddr, 1, IFADDR_STR_LEN, f);
156 	if (!ASSERT_EQ(ret, IFADDR_STR_LEN, "fread ifaddr")) {
157 		fclose(f);
158 		return -1;
159 	}
160 	fclose(f);
161 	return 0;
162 }
163 
164 static int create_netkit(int mode, char *prim, char *peer)
165 {
166 	struct rtattr *linkinfo, *data, *peer_info;
167 	struct rtnl_handle rth = { .fd = -1 };
168 	const char *type = "netkit";
169 	struct {
170 		struct nlmsghdr n;
171 		struct ifinfomsg i;
172 		char buf[1024];
173 	} req = {};
174 	int err;
175 
176 	err = rtnl_open(&rth, 0);
177 	if (!ASSERT_OK(err, "open_rtnetlink"))
178 		return err;
179 
180 	memset(&req, 0, sizeof(req));
181 	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
182 	req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
183 	req.n.nlmsg_type = RTM_NEWLINK;
184 	req.i.ifi_family = AF_UNSPEC;
185 
186 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim));
187 	linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO);
188 	addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type));
189 	data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA);
190 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
191 	peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO);
192 	req.n.nlmsg_len += sizeof(struct ifinfomsg);
193 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer));
194 	addattr_nest_end(&req.n, peer_info);
195 	addattr_nest_end(&req.n, data);
196 	addattr_nest_end(&req.n, linkinfo);
197 
198 	err = rtnl_talk(&rth, &req.n, NULL);
199 	ASSERT_OK(err, "talk_rtnetlink");
200 	rtnl_close(&rth);
201 	return err;
202 }
203 
204 static int netns_setup_links_and_routes(struct netns_setup_result *result)
205 {
206 	struct nstoken *nstoken = NULL;
207 	char src_fwd_addr[IFADDR_STR_LEN+1] = {};
208 	char src_addr[IFADDR_STR_LEN + 1] = {};
209 	int err;
210 
211 	if (result->dev_mode == MODE_VETH) {
212 		SYS(fail, "ip link add src address " MAC_SRC " type veth "
213 			  "peer name src_fwd address " MAC_SRC_FWD);
214 		SYS(fail, "ip link add dst address " MAC_DST " type veth "
215 			  "peer name dst_fwd address " MAC_DST_FWD);
216 	} else if (result->dev_mode == MODE_NETKIT) {
217 		err = create_netkit(NETKIT_L3, "src", "src_fwd");
218 		if (!ASSERT_OK(err, "create_ifindex_src"))
219 			goto fail;
220 		err = create_netkit(NETKIT_L3, "dst", "dst_fwd");
221 		if (!ASSERT_OK(err, "create_ifindex_dst"))
222 			goto fail;
223 	}
224 
225 	if (get_ifaddr("src_fwd", src_fwd_addr))
226 		goto fail;
227 
228 	if (get_ifaddr("src", src_addr))
229 		goto fail;
230 
231 	result->ifindex_src = if_nametoindex("src");
232 	if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src"))
233 		goto fail;
234 
235 	result->ifindex_src_fwd = if_nametoindex("src_fwd");
236 	if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd"))
237 		goto fail;
238 
239 	result->ifindex_dst = if_nametoindex("dst");
240 	if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst"))
241 		goto fail;
242 
243 	result->ifindex_dst_fwd = if_nametoindex("dst_fwd");
244 	if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd"))
245 		goto fail;
246 
247 	SYS(fail, "ip link set src netns " NS_SRC);
248 	SYS(fail, "ip link set src_fwd netns " NS_FWD);
249 	SYS(fail, "ip link set dst_fwd netns " NS_FWD);
250 	SYS(fail, "ip link set dst netns " NS_DST);
251 
252 	/** setup in 'src' namespace */
253 	nstoken = open_netns(NS_SRC);
254 	if (!ASSERT_OK_PTR(nstoken, "setns src"))
255 		goto fail;
256 
257 	SYS(fail, "ip addr add " IP4_SRC "/32 dev src");
258 	SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad");
259 	SYS(fail, "ip link set dev src up");
260 
261 	SYS(fail, "ip route add " IP4_DST "/32 dev src scope global");
262 	SYS(fail, "ip route add " IP4_NET "/16 dev src scope global");
263 	SYS(fail, "ip route add " IP6_DST "/128 dev src scope global");
264 
265 	if (result->dev_mode == MODE_VETH) {
266 		SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s",
267 		    src_fwd_addr);
268 		SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s",
269 		    src_fwd_addr);
270 	}
271 
272 	close_netns(nstoken);
273 
274 	/** setup in 'fwd' namespace */
275 	nstoken = open_netns(NS_FWD);
276 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
277 		goto fail;
278 
279 	/* The fwd netns automatically gets a v6 LL address / routes, but also
280 	 * needs v4 one in order to start ARP probing. IP4_NET route is added
281 	 * to the endpoints so that the ARP processing will reply.
282 	 */
283 	SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd");
284 	SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd");
285 	SYS(fail, "ip link set dev src_fwd up");
286 	SYS(fail, "ip link set dev dst_fwd up");
287 
288 	SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global");
289 	SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global");
290 	SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global");
291 	SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global");
292 
293 	if (result->dev_mode == MODE_VETH) {
294 		SYS(fail, "ip neigh add " IP4_SRC " dev src_fwd lladdr %s", src_addr);
295 		SYS(fail, "ip neigh add " IP6_SRC " dev src_fwd lladdr %s", src_addr);
296 		SYS(fail, "ip neigh add " IP4_DST " dev dst_fwd lladdr %s", MAC_DST);
297 		SYS(fail, "ip neigh add " IP6_DST " dev dst_fwd lladdr %s", MAC_DST);
298 	}
299 
300 	close_netns(nstoken);
301 
302 	/** setup in 'dst' namespace */
303 	nstoken = open_netns(NS_DST);
304 	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
305 		goto fail;
306 
307 	SYS(fail, "ip addr add " IP4_DST "/32 dev dst");
308 	SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad");
309 	SYS(fail, "ip link set dev dst up");
310 	SYS(fail, "ip link set dev lo up");
311 
312 	SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global");
313 	SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global");
314 	SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global");
315 
316 	if (result->dev_mode == MODE_VETH) {
317 		SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD);
318 		SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD);
319 	}
320 
321 	close_netns(nstoken);
322 
323 	return 0;
324 fail:
325 	if (nstoken)
326 		close_netns(nstoken);
327 	return -1;
328 }
329 
330 static int qdisc_clsact_create(struct bpf_tc_hook *qdisc_hook, int ifindex)
331 {
332 	char err_str[128], ifname[16];
333 	int err;
334 
335 	qdisc_hook->ifindex = ifindex;
336 	qdisc_hook->attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
337 	err = bpf_tc_hook_create(qdisc_hook);
338 	snprintf(err_str, sizeof(err_str),
339 		 "qdisc add dev %s clsact",
340 		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>");
341 	err_str[sizeof(err_str) - 1] = 0;
342 	ASSERT_OK(err, err_str);
343 
344 	return err;
345 }
346 
347 static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook,
348 			     enum bpf_tc_attach_point xgress,
349 			     const struct bpf_program *prog, int priority)
350 {
351 	LIBBPF_OPTS(bpf_tc_opts, tc_attach);
352 	char err_str[128], ifname[16];
353 	int err;
354 
355 	qdisc_hook->attach_point = xgress;
356 	tc_attach.prog_fd = bpf_program__fd(prog);
357 	tc_attach.priority = priority;
358 	err = bpf_tc_attach(qdisc_hook, &tc_attach);
359 	snprintf(err_str, sizeof(err_str),
360 		 "filter add dev %s %s prio %d bpf da %s",
361 		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>",
362 		 xgress == BPF_TC_INGRESS ? "ingress" : "egress",
363 		 priority, bpf_program__name(prog));
364 	err_str[sizeof(err_str) - 1] = 0;
365 	ASSERT_OK(err, err_str);
366 
367 	return err;
368 }
369 
/*
 * Call qdisc_clsact_create() and jump to the caller's "fail" label on error.
 * The caller must provide an int variable named "err".
 */
#define QDISC_CLSACT_CREATE(qdisc_hook, ifindex) ({		\
	err = qdisc_clsact_create(qdisc_hook, ifindex);		\
	if (err)						\
		goto fail;					\
})
374 
/*
 * Call xgress_filter_add() and jump to the caller's "fail" label on error.
 * The caller must provide an int variable named "err".
 */
#define XGRESS_FILTER_ADD(qdisc_hook, xgress, prog, priority) ({	\
	err = xgress_filter_add(qdisc_hook, xgress, prog, priority);	\
	if (err)							\
		goto fail;						\
})
379 
380 static int netns_load_bpf(const struct bpf_program *src_prog,
381 			  const struct bpf_program *dst_prog,
382 			  const struct bpf_program *chk_prog,
383 			  const struct netns_setup_result *setup_result)
384 {
385 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
386 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
387 	int err;
388 
389 	/* tc qdisc add dev src_fwd clsact */
390 	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
391 	/* tc filter add dev src_fwd ingress bpf da src_prog */
392 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0);
393 	/* tc filter add dev src_fwd egress bpf da chk_prog */
394 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0);
395 
396 	/* tc qdisc add dev dst_fwd clsact */
397 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
398 	/* tc filter add dev dst_fwd ingress bpf da dst_prog */
399 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0);
400 	/* tc filter add dev dst_fwd egress bpf da chk_prog */
401 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0);
402 
403 	return 0;
404 fail:
405 	return -1;
406 }
407 
408 static void test_tcp(int family, const char *addr, __u16 port)
409 {
410 	int listen_fd = -1, accept_fd = -1, client_fd = -1;
411 	char buf[] = "testing testing";
412 	int n;
413 	struct nstoken *nstoken;
414 
415 	nstoken = open_netns(NS_DST);
416 	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
417 		return;
418 
419 	listen_fd = start_server(family, SOCK_STREAM, addr, port, 0);
420 	if (!ASSERT_GE(listen_fd, 0, "listen"))
421 		goto done;
422 
423 	close_netns(nstoken);
424 	nstoken = open_netns(NS_SRC);
425 	if (!ASSERT_OK_PTR(nstoken, "setns src"))
426 		goto done;
427 
428 	client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
429 	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
430 		goto done;
431 
432 	accept_fd = accept(listen_fd, NULL, NULL);
433 	if (!ASSERT_GE(accept_fd, 0, "accept"))
434 		goto done;
435 
436 	if (!ASSERT_OK(settimeo(accept_fd, TIMEOUT_MILLIS), "settimeo"))
437 		goto done;
438 
439 	n = write(client_fd, buf, sizeof(buf));
440 	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
441 		goto done;
442 
443 	n = read(accept_fd, buf, sizeof(buf));
444 	ASSERT_EQ(n, sizeof(buf), "recv from server");
445 
446 done:
447 	if (nstoken)
448 		close_netns(nstoken);
449 	if (listen_fd >= 0)
450 		close(listen_fd);
451 	if (accept_fd >= 0)
452 		close(accept_fd);
453 	if (client_fd >= 0)
454 		close(client_fd);
455 }
456 
457 static int test_ping(int family, const char *addr)
458 {
459 	SYS(fail, "ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr);
460 	return 0;
461 fail:
462 	return -1;
463 }
464 
465 static void test_connectivity(void)
466 {
467 	test_tcp(AF_INET, IP4_DST, IP4_PORT);
468 	test_ping(AF_INET, IP4_DST);
469 	test_tcp(AF_INET6, IP6_DST, IP6_PORT);
470 	test_ping(AF_INET6, IP6_DST);
471 }
472 
/*
 * Turn IPv4 and IPv6 forwarding on or off in the current network namespace by
 * writing "1" or "0" to the corresponding /proc/sys files.
 *
 * Returns 0 on success, otherwise the write_file() error of the first knob
 * that failed.
 */
static int set_forwarding(bool enable)
{
	const char *val = enable ? "1" : "0";
	int err;

	/* The assertion label states the value that was actually written. */
	err = write_file("/proc/sys/net/ipv4/ip_forward", val);
	if (!ASSERT_OK(err, enable ? "set ipv4.ip_forward=1" :
				     "set ipv4.ip_forward=0"))
		return err;

	err = write_file("/proc/sys/net/ipv6/conf/all/forwarding", val);
	if (!ASSERT_OK(err, enable ? "set ipv6.forwarding=1" :
				     "set ipv6.forwarding=0"))
		return err;

	return 0;
}
487 
488 static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp)
489 {
490 	struct timespec pkt_ts = {};
491 	char ctl[CMSG_SPACE(sizeof(pkt_ts))];
492 	struct timespec now_ts;
493 	struct msghdr msg = {};
494 	__u64 now_ns, pkt_ns;
495 	struct cmsghdr *cmsg;
496 	struct iovec iov;
497 	char data[32];
498 	int ret;
499 
500 	iov.iov_base = data;
501 	iov.iov_len = sizeof(data);
502 	msg.msg_iov = &iov;
503 	msg.msg_iovlen = 1;
504 	msg.msg_control = &ctl;
505 	msg.msg_controllen = sizeof(ctl);
506 
507 	ret = recvmsg(fd, &msg, 0);
508 	if (!ASSERT_EQ(ret, s, "recvmsg"))
509 		return -1;
510 	ASSERT_STRNEQ(data, expected, s, "expected rcv data");
511 
512 	cmsg = CMSG_FIRSTHDR(&msg);
513 	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
514 	    cmsg->cmsg_type == SO_TIMESTAMPNS)
515 		memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts));
516 
517 	pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec;
518 	if (tstamp) {
519 		/* caller will check the tstamp itself */
520 		*tstamp = pkt_ns;
521 		return 0;
522 	}
523 
524 	ASSERT_NEQ(pkt_ns, 0, "pkt rcv tstamp");
525 
526 	ret = clock_gettime(CLOCK_REALTIME, &now_ts);
527 	ASSERT_OK(ret, "clock_gettime");
528 	now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
529 
530 	if (ASSERT_GE(now_ns, pkt_ns, "check rcv tstamp"))
531 		ASSERT_LT(now_ns - pkt_ns, 5 * NSEC_PER_SEC,
532 			  "check rcv tstamp");
533 	return 0;
534 }
535 
536 static void rcv_tstamp(int fd, const char *expected, size_t s)
537 {
538 	__rcv_tstamp(fd, expected, s, NULL);
539 }
540 
541 static int wait_netstamp_needed_key(void)
542 {
543 	int opt = 1, srv_fd = -1, cli_fd = -1, nretries = 0, err, n;
544 	char buf[] = "testing testing";
545 	struct nstoken *nstoken;
546 	__u64 tstamp = 0;
547 
548 	nstoken = open_netns(NS_DST);
549 	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
550 		return -1;
551 
552 	srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0);
553 	if (!ASSERT_GE(srv_fd, 0, "start_server"))
554 		goto done;
555 
556 	err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS,
557 			 &opt, sizeof(opt));
558 	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)"))
559 		goto done;
560 
561 	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
562 	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
563 		goto done;
564 
565 again:
566 	n = write(cli_fd, buf, sizeof(buf));
567 	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
568 		goto done;
569 	err = __rcv_tstamp(srv_fd, buf, sizeof(buf), &tstamp);
570 	if (!ASSERT_OK(err, "__rcv_tstamp"))
571 		goto done;
572 	if (!tstamp && nretries++ < 5) {
573 		sleep(1);
574 		printf("netstamp_needed_key retry#%d\n", nretries);
575 		goto again;
576 	}
577 
578 done:
579 	if (!tstamp && srv_fd != -1) {
580 		close(srv_fd);
581 		srv_fd = -1;
582 	}
583 	if (cli_fd != -1)
584 		close(cli_fd);
585 	close_netns(nstoken);
586 	return srv_fd;
587 }
588 
589 static void snd_tstamp(int fd, char *b, size_t s)
590 {
591 	struct sock_txtime opt = { .clockid = CLOCK_TAI };
592 	char ctl[CMSG_SPACE(sizeof(__u64))];
593 	struct timespec now_ts;
594 	struct msghdr msg = {};
595 	struct cmsghdr *cmsg;
596 	struct iovec iov;
597 	__u64 now_ns;
598 	int ret;
599 
600 	ret = clock_gettime(CLOCK_TAI, &now_ts);
601 	ASSERT_OK(ret, "clock_get_time(CLOCK_TAI)");
602 	now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
603 
604 	iov.iov_base = b;
605 	iov.iov_len = s;
606 	msg.msg_iov = &iov;
607 	msg.msg_iovlen = 1;
608 	msg.msg_control = &ctl;
609 	msg.msg_controllen = sizeof(ctl);
610 
611 	cmsg = CMSG_FIRSTHDR(&msg);
612 	cmsg->cmsg_level = SOL_SOCKET;
613 	cmsg->cmsg_type = SCM_TXTIME;
614 	cmsg->cmsg_len = CMSG_LEN(sizeof(now_ns));
615 	*(__u64 *)CMSG_DATA(cmsg) = now_ns;
616 
617 	ret = setsockopt(fd, SOL_SOCKET, SO_TXTIME, &opt, sizeof(opt));
618 	ASSERT_OK(ret, "setsockopt(SO_TXTIME)");
619 
620 	ret = sendmsg(fd, &msg, 0);
621 	ASSERT_EQ(ret, s, "sendmsg");
622 }
623 
624 static void test_inet_dtime(int family, int type, const char *addr, __u16 port)
625 {
626 	int opt = 1, accept_fd = -1, client_fd = -1, listen_fd, err;
627 	char buf[] = "testing testing";
628 	struct nstoken *nstoken;
629 
630 	nstoken = open_netns(NS_DST);
631 	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
632 		return;
633 	listen_fd = start_server(family, type, addr, port, 0);
634 	close_netns(nstoken);
635 
636 	if (!ASSERT_GE(listen_fd, 0, "listen"))
637 		return;
638 
639 	/* Ensure the kernel puts the (rcv) timestamp for all skb */
640 	err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS,
641 			 &opt, sizeof(opt));
642 	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)"))
643 		goto done;
644 
645 	if (type == SOCK_STREAM) {
646 		/* Ensure the kernel set EDT when sending out rst/ack
647 		 * from the kernel's ctl_sk.
648 		 */
649 		err = setsockopt(listen_fd, SOL_TCP, TCP_TX_DELAY, &opt,
650 				 sizeof(opt));
651 		if (!ASSERT_OK(err, "setsockopt(TCP_TX_DELAY)"))
652 			goto done;
653 	}
654 
655 	nstoken = open_netns(NS_SRC);
656 	if (!ASSERT_OK_PTR(nstoken, "setns src"))
657 		goto done;
658 	client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
659 	close_netns(nstoken);
660 
661 	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
662 		goto done;
663 
664 	if (type == SOCK_STREAM) {
665 		int n;
666 
667 		accept_fd = accept(listen_fd, NULL, NULL);
668 		if (!ASSERT_GE(accept_fd, 0, "accept"))
669 			goto done;
670 
671 		n = write(client_fd, buf, sizeof(buf));
672 		if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
673 			goto done;
674 		rcv_tstamp(accept_fd, buf, sizeof(buf));
675 	} else {
676 		snd_tstamp(client_fd, buf, sizeof(buf));
677 		rcv_tstamp(listen_fd, buf, sizeof(buf));
678 	}
679 
680 done:
681 	close(listen_fd);
682 	if (accept_fd != -1)
683 		close(accept_fd);
684 	if (client_fd != -1)
685 		close(client_fd);
686 }
687 
688 static int netns_load_dtime_bpf(struct test_tc_dtime *skel,
689 				const struct netns_setup_result *setup_result)
690 {
691 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
692 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
693 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src);
694 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst);
695 	struct nstoken *nstoken;
696 	int err;
697 
698 	/* setup ns_src tc progs */
699 	nstoken = open_netns(NS_SRC);
700 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
701 		return -1;
702 	/* tc qdisc add dev src clsact */
703 	QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src);
704 	/* tc filter add dev src ingress bpf da ingress_host */
705 	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
706 	/* tc filter add dev src egress bpf da egress_host */
707 	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0);
708 	close_netns(nstoken);
709 
710 	/* setup ns_dst tc progs */
711 	nstoken = open_netns(NS_DST);
712 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST))
713 		return -1;
714 	/* tc qdisc add dev dst clsact */
715 	QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst);
716 	/* tc filter add dev dst ingress bpf da ingress_host */
717 	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
718 	/* tc filter add dev dst egress bpf da egress_host */
719 	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0);
720 	close_netns(nstoken);
721 
722 	/* setup ns_fwd tc progs */
723 	nstoken = open_netns(NS_FWD);
724 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
725 		return -1;
726 	/* tc qdisc add dev dst_fwd clsact */
727 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
728 	/* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
729 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
730 			  skel->progs.ingress_fwdns_prio100, 100);
731 	/* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
732 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
733 			  skel->progs.ingress_fwdns_prio101, 101);
734 	/* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */
735 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
736 			  skel->progs.egress_fwdns_prio100, 100);
737 	/* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */
738 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
739 			  skel->progs.egress_fwdns_prio101, 101);
740 
741 	/* tc qdisc add dev src_fwd clsact */
742 	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
743 	/* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
744 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
745 			  skel->progs.ingress_fwdns_prio100, 100);
746 	/* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
747 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
748 			  skel->progs.ingress_fwdns_prio101, 101);
749 	/* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */
750 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
751 			  skel->progs.egress_fwdns_prio100, 100);
752 	/* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */
753 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
754 			  skel->progs.egress_fwdns_prio101, 101);
755 	close_netns(nstoken);
756 	return 0;
757 
758 fail:
759 	close_netns(nstoken);
760 	return err;
761 }
762 
/* Indexes of the per-test counters checked by the dtime tests. */
enum {
	INGRESS_FWDNS_P100 = 0,
	INGRESS_FWDNS_P101,
	EGRESS_FWDNS_P100,
	EGRESS_FWDNS_P101,
	INGRESS_ENDHOST,
	EGRESS_ENDHOST,
	SET_DTIME,
	__MAX_CNT,
};

/* Printable name of each counter, indexed by the enum above. */
const char *cnt_names[] = {
	[INGRESS_FWDNS_P100]	= "ingress_fwdns_p100",
	[INGRESS_FWDNS_P101]	= "ingress_fwdns_p101",
	[EGRESS_FWDNS_P100]	= "egress_fwdns_p100",
	[EGRESS_FWDNS_P101]	= "egress_fwdns_p101",
	[INGRESS_ENDHOST]	= "ingress_endhost",
	[EGRESS_ENDHOST]	= "egress_endhost",
	[SET_DTIME]		= "set_dtime",
};
783 
/* Identifiers of the dtime sub-tests; also used to index test_names[]. */
enum {
	TCP_IP6_CLEAR_DTIME = 0,
	TCP_IP4,
	TCP_IP6,
	UDP_IP4,
	UDP_IP6,
	TCP_IP4_RT_FWD,
	TCP_IP6_RT_FWD,
	UDP_IP4_RT_FWD,
	UDP_IP6_RT_FWD,
	UKN_TEST,
	__NR_TESTS,
};

/* Printable name of each sub-test; UKN_TEST has no entry. */
const char *test_names[] = {
	[TCP_IP6_CLEAR_DTIME]	= "tcp ip6 clear dtime",
	[TCP_IP4]		= "tcp ip4",
	[TCP_IP6]		= "tcp ip6",
	[UDP_IP4]		= "udp ip4",
	[UDP_IP6]		= "udp ip6",
	[TCP_IP4_RT_FWD]	= "tcp ip4 rt fwd",
	[TCP_IP6_RT_FWD]	= "tcp ip6 rt fwd",
	[UDP_IP4_RT_FWD]	= "udp ip4 rt fwd",
	[UDP_IP6_RT_FWD]	= "udp ip6 rt fwd",
};
809 
810 static const char *dtime_cnt_str(int test, int cnt)
811 {
812 	static char name[64];
813 
814 	snprintf(name, sizeof(name), "%s %s", test_names[test], cnt_names[cnt]);
815 
816 	return name;
817 }
818 
819 static const char *dtime_err_str(int test, int cnt)
820 {
821 	static char name[64];
822 
823 	snprintf(name, sizeof(name), "%s %s errs", test_names[test],
824 		 cnt_names[cnt]);
825 
826 	return name;
827 }
828 
829 static void test_tcp_clear_dtime(struct test_tc_dtime *skel)
830 {
831 	int i, t = TCP_IP6_CLEAR_DTIME;
832 	__u32 *dtimes = skel->bss->dtimes[t];
833 	__u32 *errs = skel->bss->errs[t];
834 
835 	skel->bss->test = t;
836 	test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 50000 + t);
837 
838 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
839 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
840 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0,
841 		  dtime_cnt_str(t, INGRESS_FWDNS_P101));
842 	ASSERT_GT(dtimes[EGRESS_FWDNS_P100], 0,
843 		  dtime_cnt_str(t, EGRESS_FWDNS_P100));
844 	ASSERT_EQ(dtimes[EGRESS_FWDNS_P101], 0,
845 		  dtime_cnt_str(t, EGRESS_FWDNS_P101));
846 	ASSERT_GT(dtimes[EGRESS_ENDHOST], 0,
847 		  dtime_cnt_str(t, EGRESS_ENDHOST));
848 	ASSERT_GT(dtimes[INGRESS_ENDHOST], 0,
849 		  dtime_cnt_str(t, INGRESS_ENDHOST));
850 
851 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
852 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
853 }
854 
855 static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
856 {
857 	__u32 *dtimes, *errs;
858 	const char *addr;
859 	int i, t;
860 
861 	if (family == AF_INET) {
862 		t = bpf_fwd ? TCP_IP4 : TCP_IP4_RT_FWD;
863 		addr = IP4_DST;
864 	} else {
865 		t = bpf_fwd ? TCP_IP6 : TCP_IP6_RT_FWD;
866 		addr = IP6_DST;
867 	}
868 
869 	dtimes = skel->bss->dtimes[t];
870 	errs = skel->bss->errs[t];
871 
872 	skel->bss->test = t;
873 	test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t);
874 
875 	/* fwdns_prio100 prog does not read delivery_time_type, so
876 	 * kernel puts the (rcv) timestamp in __sk_buff->tstamp
877 	 */
878 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
879 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
880 	for (i = INGRESS_FWDNS_P101; i < SET_DTIME; i++)
881 		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
882 
883 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
884 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
885 }
886 
887 static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
888 {
889 	__u32 *dtimes, *errs;
890 	const char *addr;
891 	int i, t;
892 
893 	if (family == AF_INET) {
894 		t = bpf_fwd ? UDP_IP4 : UDP_IP4_RT_FWD;
895 		addr = IP4_DST;
896 	} else {
897 		t = bpf_fwd ? UDP_IP6 : UDP_IP6_RT_FWD;
898 		addr = IP6_DST;
899 	}
900 
901 	dtimes = skel->bss->dtimes[t];
902 	errs = skel->bss->errs[t];
903 
904 	skel->bss->test = t;
905 	test_inet_dtime(family, SOCK_DGRAM, addr, 50000 + t);
906 
907 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
908 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
909 	for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++)
910 		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
911 
912 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
913 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
914 }
915 
916 static void test_tc_redirect_dtime(struct netns_setup_result *setup_result)
917 {
918 	struct test_tc_dtime *skel;
919 	struct nstoken *nstoken;
920 	int hold_tstamp_fd, err;
921 
922 	/* Hold a sk with the SOCK_TIMESTAMP set to ensure there
923 	 * is no delay in the kernel net_enable_timestamp().
924 	 * This ensures the following tests must have
925 	 * non zero rcv tstamp in the recvmsg().
926 	 */
927 	hold_tstamp_fd = wait_netstamp_needed_key();
928 	if (!ASSERT_GE(hold_tstamp_fd, 0, "wait_netstamp_needed_key"))
929 		return;
930 
931 	skel = test_tc_dtime__open();
932 	if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open"))
933 		goto done;
934 
935 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
936 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
937 
938 	err = test_tc_dtime__load(skel);
939 	if (!ASSERT_OK(err, "test_tc_dtime__load"))
940 		goto done;
941 
942 	if (netns_load_dtime_bpf(skel, setup_result))
943 		goto done;
944 
945 	nstoken = open_netns(NS_FWD);
946 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
947 		goto done;
948 	err = set_forwarding(false);
949 	close_netns(nstoken);
950 	if (!ASSERT_OK(err, "disable forwarding"))
951 		goto done;
952 
953 	test_tcp_clear_dtime(skel);
954 
955 	test_tcp_dtime(skel, AF_INET, true);
956 	test_tcp_dtime(skel, AF_INET6, true);
957 	test_udp_dtime(skel, AF_INET, true);
958 	test_udp_dtime(skel, AF_INET6, true);
959 
960 	/* Test the kernel ip[6]_forward path instead
961 	 * of bpf_redirect_neigh().
962 	 */
963 	nstoken = open_netns(NS_FWD);
964 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
965 		goto done;
966 	err = set_forwarding(true);
967 	close_netns(nstoken);
968 	if (!ASSERT_OK(err, "enable forwarding"))
969 		goto done;
970 
971 	test_tcp_dtime(skel, AF_INET, false);
972 	test_tcp_dtime(skel, AF_INET6, false);
973 	test_udp_dtime(skel, AF_INET, false);
974 	test_udp_dtime(skel, AF_INET6, false);
975 
976 done:
977 	test_tc_dtime__destroy(skel);
978 	close(hold_tstamp_fd);
979 }
980 
981 static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
982 {
983 	struct nstoken *nstoken = NULL;
984 	struct test_tc_neigh_fib *skel = NULL;
985 
986 	nstoken = open_netns(NS_FWD);
987 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
988 		return;
989 
990 	skel = test_tc_neigh_fib__open();
991 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
992 		goto done;
993 
994 	if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load"))
995 		goto done;
996 
997 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
998 			   skel->progs.tc_chk, setup_result))
999 		goto done;
1000 
1001 	/* bpf_fib_lookup() checks if forwarding is enabled */
1002 	if (!ASSERT_OK(set_forwarding(true), "enable forwarding"))
1003 		goto done;
1004 
1005 	test_connectivity();
1006 
1007 done:
1008 	if (skel)
1009 		test_tc_neigh_fib__destroy(skel);
1010 	close_netns(nstoken);
1011 }
1012 
1013 static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
1014 {
1015 	struct nstoken *nstoken = NULL;
1016 	struct test_tc_neigh *skel = NULL;
1017 	int err;
1018 
1019 	nstoken = open_netns(NS_FWD);
1020 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1021 		return;
1022 
1023 	skel = test_tc_neigh__open();
1024 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
1025 		goto done;
1026 
1027 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1028 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1029 
1030 	err = test_tc_neigh__load(skel);
1031 	if (!ASSERT_OK(err, "test_tc_neigh__load"))
1032 		goto done;
1033 
1034 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1035 			   skel->progs.tc_chk, setup_result))
1036 		goto done;
1037 
1038 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1039 		goto done;
1040 
1041 	test_connectivity();
1042 
1043 done:
1044 	if (skel)
1045 		test_tc_neigh__destroy(skel);
1046 	close_netns(nstoken);
1047 }
1048 
1049 static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
1050 {
1051 	struct nstoken *nstoken;
1052 	struct test_tc_peer *skel;
1053 	int err;
1054 
1055 	nstoken = open_netns(NS_FWD);
1056 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1057 		return;
1058 
1059 	skel = test_tc_peer__open();
1060 	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1061 		goto done;
1062 
1063 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1064 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1065 
1066 	err = test_tc_peer__load(skel);
1067 	if (!ASSERT_OK(err, "test_tc_peer__load"))
1068 		goto done;
1069 
1070 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1071 			   skel->progs.tc_chk, setup_result))
1072 		goto done;
1073 
1074 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1075 		goto done;
1076 
1077 	test_connectivity();
1078 
1079 done:
1080 	if (skel)
1081 		test_tc_peer__destroy(skel);
1082 	close_netns(nstoken);
1083 }
1084 
1085 static int tun_open(char *name)
1086 {
1087 	struct ifreq ifr;
1088 	int fd, err;
1089 
1090 	fd = open("/dev/net/tun", O_RDWR);
1091 	if (!ASSERT_GE(fd, 0, "open /dev/net/tun"))
1092 		return -1;
1093 
1094 	memset(&ifr, 0, sizeof(ifr));
1095 
1096 	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
1097 	if (*name)
1098 		strncpy(ifr.ifr_name, name, IFNAMSIZ);
1099 
1100 	err = ioctl(fd, TUNSETIFF, &ifr);
1101 	if (!ASSERT_OK(err, "ioctl TUNSETIFF"))
1102 		goto fail;
1103 
1104 	SYS(fail, "ip link set dev %s up", name);
1105 
1106 	return fd;
1107 fail:
1108 	close(fd);
1109 	return -1;
1110 }
1111 
/* Direction in which tun_relay_loop() forwards the packet it just read. */
enum {
	SRC_TO_TARGET,	/* read from src_fd, write to target_fd */
	TARGET_TO_SRC,	/* read from target_fd, write to src_fd */
};
1116 
1117 static int tun_relay_loop(int src_fd, int target_fd)
1118 {
1119 	fd_set rfds, wfds;
1120 
1121 	FD_ZERO(&rfds);
1122 	FD_ZERO(&wfds);
1123 
1124 	for (;;) {
1125 		char buf[1500];
1126 		int direction, nread, nwrite;
1127 
1128 		FD_SET(src_fd, &rfds);
1129 		FD_SET(target_fd, &rfds);
1130 
1131 		if (select(1 + MAX(src_fd, target_fd), &rfds, NULL, NULL, NULL) < 0) {
1132 			log_err("select failed");
1133 			return 1;
1134 		}
1135 
1136 		direction = FD_ISSET(src_fd, &rfds) ? SRC_TO_TARGET : TARGET_TO_SRC;
1137 
1138 		nread = read(direction == SRC_TO_TARGET ? src_fd : target_fd, buf, sizeof(buf));
1139 		if (nread < 0) {
1140 			log_err("read failed");
1141 			return 1;
1142 		}
1143 
1144 		nwrite = write(direction == SRC_TO_TARGET ? target_fd : src_fd, buf, nread);
1145 		if (nwrite != nread) {
1146 			log_err("write failed");
1147 			return 1;
1148 		}
1149 	}
1150 }
1151 
1152 static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
1153 {
1154 	LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd);
1155 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
1156 	struct test_tc_peer *skel = NULL;
1157 	struct nstoken *nstoken = NULL;
1158 	int err;
1159 	int tunnel_pid = -1;
1160 	int src_fd, target_fd = -1;
1161 	int ifindex;
1162 
1163 	/* Start a L3 TUN/TAP tunnel between the src and dst namespaces.
1164 	 * This test is using TUN/TAP instead of e.g. IPIP or GRE tunnel as those
1165 	 * expose the L2 headers encapsulating the IP packet to BPF and hence
1166 	 * don't have skb in suitable state for this test. Alternative to TUN/TAP
1167 	 * would be e.g. Wireguard which would appear as a pure L3 device to BPF,
1168 	 * but that requires much more complicated setup.
1169 	 */
1170 	nstoken = open_netns(NS_SRC);
1171 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
1172 		return;
1173 
1174 	src_fd = tun_open("tun_src");
1175 	if (!ASSERT_GE(src_fd, 0, "tun_open tun_src"))
1176 		goto fail;
1177 
1178 	close_netns(nstoken);
1179 
1180 	nstoken = open_netns(NS_FWD);
1181 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
1182 		goto fail;
1183 
1184 	target_fd = tun_open("tun_fwd");
1185 	if (!ASSERT_GE(target_fd, 0, "tun_open tun_fwd"))
1186 		goto fail;
1187 
1188 	tunnel_pid = fork();
1189 	if (!ASSERT_GE(tunnel_pid, 0, "fork tun_relay_loop"))
1190 		goto fail;
1191 
1192 	if (tunnel_pid == 0)
1193 		exit(tun_relay_loop(src_fd, target_fd));
1194 
1195 	skel = test_tc_peer__open();
1196 	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1197 		goto fail;
1198 
1199 	ifindex = if_nametoindex("tun_fwd");
1200 	if (!ASSERT_GT(ifindex, 0, "if_indextoname tun_fwd"))
1201 		goto fail;
1202 
1203 	skel->rodata->IFINDEX_SRC = ifindex;
1204 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1205 
1206 	err = test_tc_peer__load(skel);
1207 	if (!ASSERT_OK(err, "test_tc_peer__load"))
1208 		goto fail;
1209 
1210 	/* Load "tc_src_l3" to the tun_fwd interface to redirect packets
1211 	 * towards dst, and "tc_dst" to redirect packets
1212 	 * and "tc_chk" on dst_fwd to drop non-redirected packets.
1213 	 */
1214 	/* tc qdisc add dev tun_fwd clsact */
1215 	QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex);
1216 	/* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */
1217 	XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0);
1218 
1219 	/* tc qdisc add dev dst_fwd clsact */
1220 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
1221 	/* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */
1222 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0);
1223 	/* tc filter add dev dst_fwd egress bpf da tc_chk */
1224 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0);
1225 
1226 	/* Setup route and neigh tables */
1227 	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24");
1228 	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP4_TUN_FWD "/24");
1229 
1230 	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad");
1231 	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad");
1232 
1233 	SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global");
1234 	SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD
1235 	    " dev tun_src scope global");
1236 	SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global");
1237 	SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global");
1238 	SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD
1239 	    " dev tun_src scope global");
1240 	SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global");
1241 
1242 	SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1243 	SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1244 
1245 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1246 		goto fail;
1247 
1248 	test_connectivity();
1249 
1250 fail:
1251 	if (tunnel_pid > 0) {
1252 		kill(tunnel_pid, SIGTERM);
1253 		waitpid(tunnel_pid, NULL, 0);
1254 	}
1255 	if (src_fd >= 0)
1256 		close(src_fd);
1257 	if (target_fd >= 0)
1258 		close(target_fd);
1259 	if (skel)
1260 		test_tc_peer__destroy(skel);
1261 	if (nstoken)
1262 		close_netns(nstoken);
1263 }
1264 
/* Run subtest test_<name>() with device mode <mode>.
 *
 * Nothing is set up unless test__start_subtest() selects the subtest. When
 * it does, the namespaces are added, links and routes are set up, and only
 * then is the test function called with the setup result. The namespaces
 * are deleted again whenever adding them succeeded, even if the link/route
 * setup failed.
 */
#define RUN_TEST(name, mode)                                                        \
	({                                                                          \
		struct netns_setup_result setup_result = {                          \
			.dev_mode = mode,                                           \
		};                                                                  \
		if (test__start_subtest(#name) &&                                   \
		    ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \
			if (ASSERT_OK(netns_setup_links_and_routes(&setup_result),  \
				      "setup links and routes"))                    \
				test_ ## name(&setup_result);                       \
			netns_setup_namespaces("delete");                           \
		}                                                                   \
	})
1276 
/* Thread entry point that runs every tc_redirect subtest in sequence.
 *
 * @arg is not used. Always returns NULL; results are reported through the
 * ASSERT_*() calls made inside each subtest.
 */
static void *test_tc_redirect_run_tests(void *arg)
{
	/* Best-effort delete before the first subtest (presumably to clear
	 * namespaces left behind by an earlier run).
	 */
	netns_setup_namespaces_nofail("delete");

	/* Each RUN_TEST() adds the namespaces, runs one subtest and deletes
	 * them again; the peer variants run once per device mode.
	 */
	RUN_TEST(tc_redirect_peer, MODE_VETH);
	RUN_TEST(tc_redirect_peer, MODE_NETKIT);
	RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
	RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
	RUN_TEST(tc_redirect_neigh, MODE_VETH);
	RUN_TEST(tc_redirect_neigh_fib, MODE_VETH);
	RUN_TEST(tc_redirect_dtime, MODE_VETH);
	return NULL;
}
1290 
1291 void test_tc_redirect(void)
1292 {
1293 	pthread_t test_thread;
1294 	int err;
1295 
1296 	/* Run the tests in their own thread to isolate the namespace changes
1297 	 * so they do not affect the environment of other tests.
1298 	 * (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
1299 	 */
1300 	err = pthread_create(&test_thread, NULL, &test_tc_redirect_run_tests, NULL);
1301 	if (ASSERT_OK(err, "pthread_create"))
1302 		ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
1303 }
1304