xref: /linux/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <test_progs.h>
3 #include <network_helpers.h>
4 #include "xdp_metadata.skel.h"
5 #include "xdp_metadata2.skel.h"
6 #include "xdp_metadata.h"
7 #include "xsk.h"
8 
9 #include <bpf/btf.h>
10 #include <linux/errqueue.h>
11 #include <linux/if_link.h>
12 #include <linux/net_tstamp.h>
13 #include <linux/udp.h>
14 #include <sys/mman.h>
15 #include <net/if.h>
16 #include <poll.h>
17 
/* Names of the two ends of the veth pair created by the test. */
#define TX_NAME "veTX"
#define RX_NAME "veRX"

/* Size, in bytes, of the UDP payload of every generated packet. */
#define UDP_PAYLOAD_BYTES 4

/* UDP source port of AF_XDP-built packets and the destination port
 * that all test packets are sent to.
 */
#define UDP_SOURCE_PORT 1234
#define AF_XDP_CONSUMER_PORT 8080

/* UMEM geometry: each socket gets UMEM_NUM frames of UMEM_FRAME_SIZE
 * bytes; XDP_FLAGS is passed to bpf_xdp_attach() and QUEUE_ID is the
 * queue the AF_XDP sockets are created on.
 */
#define UMEM_NUM 16
#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
#define XDP_FLAGS XDP_FLAGS_DRV_MODE
#define QUEUE_ID 0

/* IPv4/MAC addressing of the TX and RX sides and the names of the
 * network namespaces each side lives in.
 */
#define TX_ADDR "10.0.0.1"
#define RX_ADDR "10.0.0.2"
#define PREFIX_LEN "8"
#define FAMILY AF_INET
#define TX_NETNS_NAME "xdp_metadata_tx"
#define RX_NETNS_NAME "xdp_metadata_rx"
#define TX_MAC "00:00:00:00:00:01"
#define RX_MAC "00:00:00:00:00:02"

/* VLAN device stacked on top of TX_NAME; VLAN_PID is the protocol
 * value compared against the rx_vlan_proto metadata field.
 */
#define VLAN_ID 59
#define VLAN_PROTO "802.1Q"
#define VLAN_PID htons(ETH_P_8021Q)
#define TX_NAME_VLAN TX_NAME "." TO_STR(VLAN_ID)

/* NOTE(review): presumably mirrors the kernel's "L4" RSS hash type
 * bit - confirm against the kernel definition.
 */
#define XDP_RSS_TYPE_L4 BIT(3)
/* Mask applied to rx_vlan_tci before comparing it with VLAN_ID. */
#define VLAN_VID_MASK 0xfff
48 
/* State of one AF_XDP socket used by the test: the UMEM backing memory,
 * the UMEM handle with its fill/completion rings, and the socket handle
 * with its TX/RX rings.
 */
struct xsk {
	void *umem_area;		/* mmap()ed region of UMEM_SIZE bytes */
	struct xsk_umem *umem;		/* set by xsk_umem__create() */
	struct xsk_ring_prod fill;	/* fill ring, loaded with the RX frame addresses */
	struct xsk_ring_cons comp;	/* completion ring, read by complete_tx() */
	struct xsk_ring_prod tx;	/* TX descriptor ring */
	struct xsk_ring_cons rx;	/* RX descriptor ring */
	struct xsk_socket *socket;	/* set by xsk_socket__create() */
};
58 
59 static int open_xsk(int ifindex, struct xsk *xsk)
60 {
61 	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
62 	const struct xsk_socket_config socket_config = {
63 		.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
64 		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
65 		.bind_flags = XDP_COPY,
66 	};
67 	const struct xsk_umem_config umem_config = {
68 		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
69 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
70 		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
71 		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM,
72 		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
73 	};
74 	__u32 idx;
75 	u64 addr;
76 	int ret;
77 	int i;
78 
79 	xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
80 	if (!ASSERT_NEQ(xsk->umem_area, MAP_FAILED, "mmap"))
81 		return -1;
82 
83 	ret = xsk_umem__create(&xsk->umem,
84 			       xsk->umem_area, UMEM_SIZE,
85 			       &xsk->fill,
86 			       &xsk->comp,
87 			       &umem_config);
88 	if (!ASSERT_OK(ret, "xsk_umem__create"))
89 		return ret;
90 
91 	ret = xsk_socket__create(&xsk->socket, ifindex, QUEUE_ID,
92 				 xsk->umem,
93 				 &xsk->rx,
94 				 &xsk->tx,
95 				 &socket_config);
96 	if (!ASSERT_OK(ret, "xsk_socket__create"))
97 		return ret;
98 
99 	/* First half of umem is for TX. This way address matches 1-to-1
100 	 * to the completion queue index.
101 	 */
102 
103 	for (i = 0; i < UMEM_NUM / 2; i++) {
104 		addr = i * UMEM_FRAME_SIZE;
105 		printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
106 	}
107 
108 	/* Second half of umem is for RX. */
109 
110 	ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
111 	if (!ASSERT_EQ(UMEM_NUM / 2, ret, "xsk_ring_prod__reserve"))
112 		return ret;
113 	if (!ASSERT_EQ(idx, 0, "fill idx != 0"))
114 		return -1;
115 
116 	for (i = 0; i < UMEM_NUM / 2; i++) {
117 		addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
118 		printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
119 		*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
120 	}
121 	xsk_ring_prod__submit(&xsk->fill, ret);
122 
123 	return 0;
124 }
125 
126 static void close_xsk(struct xsk *xsk)
127 {
128 	if (xsk->umem)
129 		xsk_umem__delete(xsk->umem);
130 	if (xsk->socket)
131 		xsk_socket__delete(xsk->socket);
132 	munmap(xsk->umem_area, UMEM_SIZE);
133 }
134 
/* Compute the IPv4 header checksum of @iph and store it in iph->check.
 *
 * The header is summed as 16-bit words with the checksum field zeroed,
 * carries are folded back into the low 16 bits and the complement of
 * the result is stored.
 */
static void ip_csum(struct iphdr *iph)
{
	const __u16 *word = (const void *)iph;
	const __u16 *end = word + sizeof(*iph) / sizeof(*word);
	__u32 acc = 0;

	/* The checksum field itself must not contribute to the sum. */
	iph->check = 0;

	while (word < end)
		acc += *word++;

	/* Fold the carries until the sum fits in 16 bits. */
	while (acc >> 16)
		acc = (acc >> 16) + (acc & 0xffff);

	iph->check = ~acc;
}
151 
152 static int generate_packet(struct xsk *xsk, __u16 dst_port)
153 {
154 	struct xsk_tx_metadata *meta;
155 	struct xdp_desc *tx_desc;
156 	struct udphdr *udph;
157 	struct ethhdr *eth;
158 	struct iphdr *iph;
159 	void *data;
160 	__u32 idx;
161 	int ret;
162 
163 	ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
164 	if (!ASSERT_EQ(ret, 1, "xsk_ring_prod__reserve"))
165 		return -1;
166 
167 	tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
168 	tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
169 	printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr);
170 	data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
171 
172 	meta = data - sizeof(struct xsk_tx_metadata);
173 	memset(meta, 0, sizeof(*meta));
174 	meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
175 
176 	eth = data;
177 	iph = (void *)(eth + 1);
178 	udph = (void *)(iph + 1);
179 
180 	memcpy(eth->h_dest, "\x00\x00\x00\x00\x00\x02", ETH_ALEN);
181 	memcpy(eth->h_source, "\x00\x00\x00\x00\x00\x01", ETH_ALEN);
182 	eth->h_proto = htons(ETH_P_IP);
183 
184 	iph->version = 0x4;
185 	iph->ihl = 0x5;
186 	iph->tos = 0x9;
187 	iph->tot_len = htons(sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES);
188 	iph->id = 0;
189 	iph->frag_off = 0;
190 	iph->ttl = 0;
191 	iph->protocol = IPPROTO_UDP;
192 	ASSERT_EQ(inet_pton(FAMILY, TX_ADDR, &iph->saddr), 1, "inet_pton(TX_ADDR)");
193 	ASSERT_EQ(inet_pton(FAMILY, RX_ADDR, &iph->daddr), 1, "inet_pton(RX_ADDR)");
194 	ip_csum(iph);
195 
196 	udph->source = htons(UDP_SOURCE_PORT);
197 	udph->dest = htons(dst_port);
198 	udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES);
199 	udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
200 					 ntohs(udph->len), IPPROTO_UDP, 0);
201 
202 	memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES);
203 
204 	meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
205 	meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
206 	meta->request.csum_offset = offsetof(struct udphdr, check);
207 
208 	tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES;
209 	tx_desc->options |= XDP_TX_METADATA;
210 	xsk_ring_prod__submit(&xsk->tx, 1);
211 
212 	ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
213 	if (!ASSERT_GE(ret, 0, "sendto"))
214 		return ret;
215 
216 	return 0;
217 }
218 
219 static int generate_packet_inet(void)
220 {
221 	char udp_payload[UDP_PAYLOAD_BYTES];
222 	struct sockaddr_in rx_addr;
223 	int sock_fd, err = 0;
224 
225 	/* Build a packet */
226 	memset(udp_payload, 0xAA, UDP_PAYLOAD_BYTES);
227 	rx_addr.sin_addr.s_addr = inet_addr(RX_ADDR);
228 	rx_addr.sin_family = AF_INET;
229 	rx_addr.sin_port = htons(AF_XDP_CONSUMER_PORT);
230 
231 	sock_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
232 	if (!ASSERT_GE(sock_fd, 0, "socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)"))
233 		return sock_fd;
234 
235 	err = sendto(sock_fd, udp_payload, UDP_PAYLOAD_BYTES, MSG_DONTWAIT,
236 		     (void *)&rx_addr, sizeof(rx_addr));
237 	ASSERT_GE(err, 0, "sendto");
238 
239 	close(sock_fd);
240 	return err;
241 }
242 
243 static void complete_tx(struct xsk *xsk)
244 {
245 	struct xsk_tx_metadata *meta;
246 	__u64 addr;
247 	void *data;
248 	__u32 idx;
249 
250 	if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
251 		addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
252 
253 		printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
254 
255 		data = xsk_umem__get_data(xsk->umem_area, addr);
256 		meta = data - sizeof(struct xsk_tx_metadata);
257 
258 		ASSERT_NEQ(meta->completion.tx_timestamp, 0, "tx_timestamp");
259 
260 		xsk_ring_cons__release(&xsk->comp, 1);
261 	}
262 }
263 
264 static void refill_rx(struct xsk *xsk, __u64 addr)
265 {
266 	__u32 idx;
267 
268 	if (ASSERT_EQ(xsk_ring_prod__reserve(&xsk->fill, 1, &idx), 1, "xsk_ring_prod__reserve")) {
269 		printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
270 		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
271 		xsk_ring_prod__submit(&xsk->fill, 1);
272 	}
273 }
274 
275 static int verify_xsk_metadata(struct xsk *xsk, bool sent_from_af_xdp)
276 {
277 	const struct xdp_desc *rx_desc;
278 	struct pollfd fds = {};
279 	struct xdp_meta *meta;
280 	struct udphdr *udph;
281 	struct ethhdr *eth;
282 	struct iphdr *iph;
283 	__u64 comp_addr;
284 	void *data;
285 	__u64 addr;
286 	__u32 idx = 0;
287 	int ret;
288 
289 	ret = recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
290 	if (!ASSERT_EQ(ret, 0, "recvfrom"))
291 		return -1;
292 
293 	fds.fd = xsk_socket__fd(xsk->socket);
294 	fds.events = POLLIN;
295 
296 	ret = poll(&fds, 1, 1000);
297 	if (!ASSERT_GT(ret, 0, "poll"))
298 		return -1;
299 
300 	ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
301 	if (!ASSERT_EQ(ret, 1, "xsk_ring_cons__peek"))
302 		return -2;
303 
304 	rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
305 	comp_addr = xsk_umem__extract_addr(rx_desc->addr);
306 	addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
307 	printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
308 	       xsk, idx, rx_desc->addr, addr, comp_addr);
309 	data = xsk_umem__get_data(xsk->umem_area, addr);
310 
311 	/* Make sure we got the packet offset correctly. */
312 
313 	eth = data;
314 	ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto");
315 	iph = (void *)(eth + 1);
316 	ASSERT_EQ((int)iph->version, 4, "iph->version");
317 	udph = (void *)(iph + 1);
318 
319 	/* custom metadata */
320 
321 	meta = data - sizeof(struct xdp_meta);
322 
323 	if (!ASSERT_NEQ(meta->rx_timestamp, 0, "rx_timestamp"))
324 		return -1;
325 
326 	if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
327 		return -1;
328 
329 	if (!sent_from_af_xdp) {
330 		if (!ASSERT_NEQ(meta->rx_hash_type & XDP_RSS_TYPE_L4, 0, "rx_hash_type"))
331 			return -1;
332 
333 		if (!ASSERT_EQ(meta->rx_vlan_tci & VLAN_VID_MASK, VLAN_ID, "rx_vlan_tci"))
334 			return -1;
335 
336 		if (!ASSERT_EQ(meta->rx_vlan_proto, VLAN_PID, "rx_vlan_proto"))
337 			return -1;
338 		goto done;
339 	}
340 
341 	ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type");
342 
343 	/* checksum offload */
344 	ASSERT_EQ(udph->check, htons(0x721c), "csum");
345 
346 done:
347 	xsk_ring_cons__release(&xsk->rx, 1);
348 	refill_rx(xsk, comp_addr);
349 
350 	return 0;
351 }
352 
353 static void switch_ns_to_rx(struct nstoken **tok)
354 {
355 	close_netns(*tok);
356 	*tok = open_netns(RX_NETNS_NAME);
357 }
358 
359 static void switch_ns_to_tx(struct nstoken **tok)
360 {
361 	close_netns(*tok);
362 	*tok = open_netns(TX_NETNS_NAME);
363 }
364 
365 void test_xdp_metadata(void)
366 {
367 	struct xdp_metadata2 *bpf_obj2 = NULL;
368 	struct xdp_metadata *bpf_obj = NULL;
369 	struct bpf_program *new_prog, *prog;
370 	struct nstoken *tok = NULL;
371 	__u32 queue_id = QUEUE_ID;
372 	struct bpf_map *prog_arr;
373 	struct xsk tx_xsk = {};
374 	struct xsk rx_xsk = {};
375 	__u32 val, key = 0;
376 	int retries = 10;
377 	int rx_ifindex;
378 	int tx_ifindex;
379 	int sock_fd;
380 	int ret;
381 
382 	/* Setup new networking namespaces, with a veth pair. */
383 	SYS(out, "ip netns add " TX_NETNS_NAME);
384 	SYS(out, "ip netns add " RX_NETNS_NAME);
385 
386 	tok = open_netns(TX_NETNS_NAME);
387 	if (!ASSERT_OK_PTR(tok, "setns"))
388 		goto out;
389 	SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME
390 	    " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1");
391 	SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME);
392 
393 	SYS(out, "ip link set dev " TX_NAME " address " TX_MAC);
394 	SYS(out, "ip link set dev " TX_NAME " up");
395 
396 	SYS(out, "ip link add link " TX_NAME " " TX_NAME_VLAN
397 		 " type vlan proto " VLAN_PROTO " id " TO_STR(VLAN_ID));
398 	SYS(out, "ip link set dev " TX_NAME_VLAN " up");
399 	SYS(out, "ip addr add " TX_ADDR "/" PREFIX_LEN " dev " TX_NAME_VLAN);
400 
401 	/* Avoid ARP calls */
402 	SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN);
403 
404 	switch_ns_to_rx(&tok);
405 	if (!ASSERT_OK_PTR(tok, "setns rx"))
406 		goto out;
407 
408 	SYS(out, "ip link set dev " RX_NAME " address " RX_MAC);
409 	SYS(out, "ip link set dev " RX_NAME " up");
410 	SYS(out, "ip addr add " RX_ADDR "/" PREFIX_LEN " dev " RX_NAME);
411 
412 	rx_ifindex = if_nametoindex(RX_NAME);
413 
414 	/* Setup separate AF_XDP for RX interface. */
415 
416 	ret = open_xsk(rx_ifindex, &rx_xsk);
417 	if (!ASSERT_OK(ret, "open_xsk(RX_NAME)"))
418 		goto out;
419 
420 	bpf_obj = xdp_metadata__open();
421 	if (!ASSERT_OK_PTR(bpf_obj, "open skeleton"))
422 		goto out;
423 
424 	prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
425 	bpf_program__set_ifindex(prog, rx_ifindex);
426 	bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
427 
428 	if (!ASSERT_OK(xdp_metadata__load(bpf_obj), "load skeleton"))
429 		goto out;
430 
431 	/* Make sure we can't add dev-bound programs to prog maps. */
432 	prog_arr = bpf_object__find_map_by_name(bpf_obj->obj, "prog_arr");
433 	if (!ASSERT_OK_PTR(prog_arr, "no prog_arr map"))
434 		goto out;
435 
436 	val = bpf_program__fd(prog);
437 	if (!ASSERT_ERR(bpf_map__update_elem(prog_arr, &key, sizeof(key),
438 					     &val, sizeof(val), BPF_ANY),
439 			"update prog_arr"))
440 		goto out;
441 
442 	/* Attach BPF program to RX interface. */
443 
444 	ret = bpf_xdp_attach(rx_ifindex,
445 			     bpf_program__fd(bpf_obj->progs.rx),
446 			     XDP_FLAGS, NULL);
447 	if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
448 		goto out;
449 
450 	sock_fd = xsk_socket__fd(rx_xsk.socket);
451 	ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
452 	if (!ASSERT_GE(ret, 0, "bpf_map_update_elem"))
453 		goto out;
454 
455 	switch_ns_to_tx(&tok);
456 	if (!ASSERT_OK_PTR(tok, "setns tx"))
457 		goto out;
458 
459 	/* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */
460 	tx_ifindex = if_nametoindex(TX_NAME);
461 	ret = open_xsk(tx_ifindex, &tx_xsk);
462 	if (!ASSERT_OK(ret, "open_xsk(TX_NAME)"))
463 		goto out;
464 
465 	if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
466 		       "generate AF_XDP_CONSUMER_PORT"))
467 		goto out;
468 
469 	switch_ns_to_rx(&tok);
470 	if (!ASSERT_OK_PTR(tok, "setns rx"))
471 		goto out;
472 
473 	/* Verify packet sent from AF_XDP has proper metadata. */
474 	if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0,
475 		       "verify_xsk_metadata"))
476 		goto out;
477 
478 	switch_ns_to_tx(&tok);
479 	if (!ASSERT_OK_PTR(tok, "setns tx"))
480 		goto out;
481 	complete_tx(&tx_xsk);
482 
483 	/* Now check metadata of packet, generated with network stack */
484 	if (!ASSERT_GE(generate_packet_inet(), 0, "generate UDP packet"))
485 		goto out;
486 
487 	switch_ns_to_rx(&tok);
488 	if (!ASSERT_OK_PTR(tok, "setns rx"))
489 		goto out;
490 
491 	if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0,
492 		       "verify_xsk_metadata"))
493 		goto out;
494 
495 	/* Make sure freplace correctly picks up original bound device
496 	 * and doesn't crash.
497 	 */
498 
499 	bpf_obj2 = xdp_metadata2__open();
500 	if (!ASSERT_OK_PTR(bpf_obj2, "open skeleton"))
501 		goto out;
502 
503 	new_prog = bpf_object__find_program_by_name(bpf_obj2->obj, "freplace_rx");
504 	bpf_program__set_attach_target(new_prog, bpf_program__fd(prog), "rx");
505 
506 	if (!ASSERT_OK(xdp_metadata2__load(bpf_obj2), "load freplace skeleton"))
507 		goto out;
508 
509 	if (!ASSERT_OK(xdp_metadata2__attach(bpf_obj2), "attach freplace"))
510 		goto out;
511 
512 	switch_ns_to_tx(&tok);
513 	if (!ASSERT_OK_PTR(tok, "setns tx"))
514 		goto out;
515 
516 	/* Send packet to trigger . */
517 	if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
518 		       "generate freplace packet"))
519 		goto out;
520 
521 	switch_ns_to_rx(&tok);
522 	if (!ASSERT_OK_PTR(tok, "setns rx"))
523 		goto out;
524 
525 	while (!retries--) {
526 		if (bpf_obj2->bss->called)
527 			break;
528 		usleep(10);
529 	}
530 	ASSERT_GT(bpf_obj2->bss->called, 0, "not called");
531 
532 out:
533 	close_xsk(&rx_xsk);
534 	close_xsk(&tx_xsk);
535 	xdp_metadata2__destroy(bpf_obj2);
536 	xdp_metadata__destroy(bpf_obj);
537 	if (tok)
538 		close_netns(tok);
539 	SYS_NOFAIL("ip netns del " RX_NETNS_NAME);
540 	SYS_NOFAIL("ip netns del " TX_NETNS_NAME);
541 }
542