1 // SPDX-License-Identifier: GPL-2.0
2
3 /* Reference program for verifying XDP metadata on real HW. Functional test
4 * only, doesn't test the performance.
5 *
6 * RX:
7 * - UDP 9091 packets are diverted into AF_XDP
8 * - Metadata verified:
9 * - rx_timestamp
10 * - rx_hash
11 *
12 * TX:
13 * - UDP 9091 packets trigger TX reply
14 * - TX HW timestamp is requested and reported back upon completion
15 * - TX checksum is requested
16 * - TX launch time HW offload is requested for transmission
17 */
18
19 #include <test_progs.h>
20 #include <network_helpers.h>
21 #include "xdp_hw_metadata.skel.h"
22 #include "xsk.h"
23
24 #include <error.h>
25 #include <linux/kernel.h>
26 #include <linux/bits.h>
27 #include <linux/bitfield.h>
28 #include <linux/errqueue.h>
29 #include <linux/if_link.h>
30 #include <linux/net_tstamp.h>
31 #include <netinet/udp.h>
32 #include <linux/sockios.h>
33 #include <linux/if_xdp.h>
34 #include <sys/mman.h>
35 #include <net/if.h>
36 #include <ctype.h>
37 #include <poll.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <libgen.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <sys/ioctl.h>
45 #include <linux/pkt_sched.h>
46 #include <linux/pkt_cls.h>
47 #include <linux/ethtool.h>
48 #include <sys/socket.h>
49 #include <arpa/inet.h>
50
51 #include "xdp_metadata.h"
52
53 #define UMEM_NUM 256
54 #define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
55 #define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
56 #define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
57
/* Per-queue AF_XDP state: one UMEM plus the socket and the rings bound to it. */
struct xsk {
	void *umem_area;		/* anonymous mmap() of UMEM_SIZE bytes backing the UMEM */
	struct xsk_umem *umem;		/* UMEM handle created on top of umem_area */
	struct xsk_ring_prod fill;	/* fill ring: frame addresses posted for reception */
	struct xsk_ring_cons comp;	/* completion ring: addresses of finished TX frames */
	struct xsk_ring_prod tx;	/* TX descriptor ring */
	struct xsk_ring_cons rx;	/* RX descriptor ring */
	struct xsk_socket *socket;	/* AF_XDP socket bound to one queue of the device */
};
67
/* Program-wide state, filled in by read_args() and main(). */
struct xdp_hw_metadata *bpf_obj;	/* BPF skeleton, opened and loaded in main() */
/* AF_XDP bind flags; the -c and -m options adjust these defaults. */
__u16 bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
struct xsk *rx_xsk;			/* array of rxq entries, one per queue */
const char *ifname;			/* device name taken from the command line */
int ifindex;				/* if_nametoindex() of ifname */
int rxq;				/* queue count returned by rxq_num() */
bool skip_tx;				/* -r: do not generate an AF_XDP reply */
__u64 last_hw_rx_timestamp;		/* rx_timestamp of the last packet that carried one */
__u64 last_xdp_rx_timestamp;		/* xdp_timestamp stored together with it */
__u64 last_launch_time;			/* launch time requested for the last reply */
/* -l: launch time offset relative to the HW RX time, in ns; 0 disables the request. */
__u64 launch_time_delta_to_hw_rx_timestamp;
int launch_time_queue;			/* -L: queue that gets launch time offload */
80
/* Build a shell command printf-style (truncated to the 1024-byte buffer),
 * echo it to stderr and run it through system(). The macro evaluates to the
 * value returned by system().
 */
#define run_command(cmd, ...)						\
({									\
	char shell_cmd[1024] = { 0 };					\
									\
	snprintf(shell_cmd, sizeof(shell_cmd), cmd, ##__VA_ARGS__);	\
	fprintf(stderr, "Running: %s\n", shell_cmd);			\
	system(shell_cmd);						\
})
89
/* Empty stub provided for network_helpers.c. */
void test__fail(void)
{
}
91
/* Set up the AF_XDP state of one queue.
 *
 * Maps UMEM_SIZE bytes of anonymous memory, creates a UMEM with TX metadata
 * support on top of it, creates a socket for @queue_id of @ifindex using the
 * global bind_flags and posts the second half of the UMEM frames to the fill
 * ring.
 *
 * Returns 0 on success. On failure a non-zero error code is returned
 * (-ENOMEM for the mapping, -ENOSPC for a short fill-ring reservation, or
 * whatever the xsk helper reported); nothing acquired up to that point is
 * released.
 */
static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
{
	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
	const struct xsk_socket_config socket_config = {
		.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.bind_flags = bind_flags,
	};
	const struct xsk_umem_config umem_config = {
		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
		.flags = XDP_UMEM_TX_METADATA_LEN,
		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
	};
	__u32 idx = 0;
	u64 addr;
	int ret;
	int i;

	xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
	if (xsk->umem_area == MAP_FAILED)
		return -ENOMEM;

	ret = xsk_umem__create(&xsk->umem,
			       xsk->umem_area, UMEM_SIZE,
			       &xsk->fill,
			       &xsk->comp,
			       &umem_config);
	if (ret)
		return ret;

	ret = xsk_socket__create(&xsk->socket, ifindex, queue_id,
				 xsk->umem,
				 &xsk->rx,
				 &xsk->tx,
				 &socket_config);
	if (ret)
		return ret;

	/* First half of umem is for TX. This way address matches 1-to-1
	 * to the completion queue index.
	 */

	for (i = 0; i < UMEM_NUM / 2; i++) {
		addr = i * UMEM_FRAME_SIZE;
		printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
	}

	/* Second half of umem is for RX. */

	ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
	if (ret != UMEM_NUM / 2) {
		/* Without the full reservation the slots written below would
		 * not belong to us, so give up instead of scribbling on them.
		 */
		return -ENOSPC;
	}

	for (i = 0; i < UMEM_NUM / 2; i++) {
		addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
		printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
		*xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr;
	}
	xsk_ring_prod__submit(&xsk->fill, ret);

	return 0;
}
153
close_xsk(struct xsk * xsk)154 static void close_xsk(struct xsk *xsk)
155 {
156 if (xsk->umem)
157 xsk_umem__delete(xsk->umem);
158 if (xsk->socket)
159 xsk_socket__delete(xsk->socket);
160 munmap(xsk->umem_area, UMEM_SIZE);
161 }
162
/* Hand the frame at @addr back to the fill ring so that it can be used for
 * reception again. If no fill slot can be reserved, the frame is not
 * re-posted and nothing is reported.
 */
static void refill_rx(struct xsk *xsk, __u64 addr)
{
	__u32 slot;

	if (xsk_ring_prod__reserve(&xsk->fill, 1, &slot) != 1)
		return;

	printf("%p: complete rx idx=%u addr=%llx\n", xsk, slot, addr);
	*xsk_ring_prod__fill_addr(&xsk->fill, slot) = addr;
	xsk_ring_prod__submit(&xsk->fill, 1);
}
173
kick_tx(struct xsk * xsk)174 static int kick_tx(struct xsk *xsk)
175 {
176 return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
177 }
178
kick_rx(struct xsk * xsk)179 static int kick_rx(struct xsk *xsk)
180 {
181 return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
182 }
183
184 #define NANOSEC_PER_SEC 1000000000 /* 10^9 */
gettime(clockid_t clock_id)185 static __u64 gettime(clockid_t clock_id)
186 {
187 struct timespec t;
188 int res;
189
190 /* See man clock_gettime(2) for type of clock_id's */
191 res = clock_gettime(clock_id, &t);
192
193 if (res < 0)
194 error(res, errno, "Error with clock_gettime()");
195
196 return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
197 }
198
/* Print the timestamp @tstamp (nanoseconds), labelled @name, together with
 * its distance to @reference, labelled @refname. The delta is
 * @reference - @tstamp and is shown both in seconds and in microseconds.
 */
static void print_tstamp_delta(const char *name, const char *refname,
			       __u64 tstamp, __u64 reference)
{
	__s64 delta = (__s64)reference - (__s64)tstamp;
	double tstamp_sec = (double)tstamp / NANOSEC_PER_SEC;
	double delta_sec = (double)delta / NANOSEC_PER_SEC;
	double delta_usec = (double)delta / 1000;

	printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n",
	       name, tstamp, tstamp_sec, refname, delta_sec, delta_usec);
}
209
210 #define VLAN_PRIO_MASK GENMASK(15, 13) /* Priority Code Point */
211 #define VLAN_DEI_MASK GENMASK(12, 12) /* Drop Eligible Indicator */
212 #define VLAN_VID_MASK GENMASK(11, 0) /* VLAN Identifier */
print_vlan_tci(__u16 tag)213 static void print_vlan_tci(__u16 tag)
214 {
215 __u16 vlan_id = FIELD_GET(VLAN_VID_MASK, tag);
216 __u8 pcp = FIELD_GET(VLAN_PRIO_MASK, tag);
217 bool dei = FIELD_GET(VLAN_DEI_MASK, tag);
218
219 printf("PCP=%u, DEI=%d, VID=0x%X\n", pcp, dei, vlan_id);
220 }
221
verify_xdp_metadata(void * data,clockid_t clock_id)222 static void verify_xdp_metadata(void *data, clockid_t clock_id)
223 {
224 struct xdp_meta *meta;
225
226 meta = data - sizeof(*meta);
227
228 if (meta->hint_valid & XDP_META_FIELD_RSS)
229 printf("rx_hash: 0x%X with RSS type:0x%X\n",
230 meta->rx_hash, meta->rx_hash_type);
231 else
232 printf("No rx_hash, err=%d\n", meta->rx_hash_err);
233
234 if (meta->hint_valid & XDP_META_FIELD_TS) {
235 __u64 ref_tstamp = gettime(clock_id);
236
237 /* store received timestamps to calculate a delta at tx */
238 last_hw_rx_timestamp = meta->rx_timestamp;
239 last_xdp_rx_timestamp = meta->xdp_timestamp;
240
241 print_tstamp_delta("HW RX-time", "User RX-time",
242 meta->rx_timestamp, ref_tstamp);
243 print_tstamp_delta("XDP RX-time", "User RX-time",
244 meta->xdp_timestamp, ref_tstamp);
245 } else {
246 printf("No rx_timestamp, err=%d\n", meta->rx_timestamp_err);
247 }
248
249 if (meta->hint_valid & XDP_META_FIELD_VLAN_TAG) {
250 printf("rx_vlan_proto: 0x%X\n", ntohs(meta->rx_vlan_proto));
251 printf("rx_vlan_tci: ");
252 print_vlan_tci(meta->rx_vlan_tci);
253 } else {
254 printf("No rx_vlan_tci or rx_vlan_proto, err=%d\n",
255 meta->rx_vlan_tag_err);
256 }
257 }
258
verify_skb_metadata(int fd)259 static void verify_skb_metadata(int fd)
260 {
261 char cmsg_buf[1024];
262 char packet_buf[128];
263
264 struct scm_timestamping *ts;
265 struct iovec packet_iov;
266 struct cmsghdr *cmsg;
267 struct msghdr hdr;
268
269 memset(&hdr, 0, sizeof(hdr));
270 hdr.msg_iov = &packet_iov;
271 hdr.msg_iovlen = 1;
272 packet_iov.iov_base = packet_buf;
273 packet_iov.iov_len = sizeof(packet_buf);
274
275 hdr.msg_control = cmsg_buf;
276 hdr.msg_controllen = sizeof(cmsg_buf);
277
278 if (recvmsg(fd, &hdr, 0) < 0)
279 error(1, errno, "recvmsg");
280
281 for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
282 cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
283
284 if (cmsg->cmsg_level != SOL_SOCKET)
285 continue;
286
287 switch (cmsg->cmsg_type) {
288 case SCM_TIMESTAMPING:
289 ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
290 if (ts->ts[2].tv_sec || ts->ts[2].tv_nsec) {
291 printf("found skb hwtstamp = %lu.%lu\n",
292 ts->ts[2].tv_sec, ts->ts[2].tv_nsec);
293 return;
294 }
295 break;
296 default:
297 break;
298 }
299 }
300
301 printf("skb hwtstamp is not found!\n");
302 }
303
/* Consume at most one entry from the completion ring of @xsk and print the
 * timestamps related to it.
 *
 * Returns false when the completion ring is empty, true after one entry has
 * been processed and released. The TX metadata is read from the area right
 * in front of the completed frame's data.
 */
static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
{
	struct xsk_tx_metadata *meta;
	__u64 addr;
	__u32 idx;

	if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
		return false;

	addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
	meta = xsk_umem__get_data(xsk->umem_area, addr) -
	       sizeof(struct xsk_tx_metadata);

	printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);

	if (!meta->completion.tx_timestamp) {
		printf("No tx_timestamp\n");
	} else {
		__u64 user_now = gettime(clock_id);

		/* only meaningful when a launch time was requested */
		if (launch_time_delta_to_hw_rx_timestamp)
			print_tstamp_delta("HW Launch-time",
					   "HW TX-complete-time",
					   last_launch_time,
					   meta->completion.tx_timestamp);

		print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
				   meta->completion.tx_timestamp, user_now);
		print_tstamp_delta("XDP RX-time", "User TX-complete-time",
				   last_xdp_rx_timestamp, user_now);
		print_tstamp_delta("HW RX-time", "HW TX-complete-time",
				   last_hw_rx_timestamp, meta->completion.tx_timestamp);
	}

	xsk_ring_cons__release(&xsk->comp, 1);

	return true;
}
343
/* Exchange @len bytes between the buffers @a and @b.
 *
 * Each argument is evaluated exactly once and is fully parenthesized before
 * being converted, so expressions such as `p + 1` or `cond ? 4 : 2` can be
 * passed safely. The internal names carry a swap_ prefix and trailing
 * underscore to keep them from capturing identifiers used in the arguments.
 */
#define swap(a, b, len) do {						\
	__u8 *swap_a_ = (__u8 *)(a);					\
	__u8 *swap_b_ = (__u8 *)(b);					\
	int swap_len_ = (len);						\
									\
	for (int swap_i_ = 0; swap_i_ < swap_len_; swap_i_++) {	\
		__u8 swap_tmp_ = swap_a_[swap_i_];			\
									\
		swap_a_[swap_i_] = swap_b_[swap_i_];			\
		swap_b_[swap_i_] = swap_tmp_;				\
	}								\
} while (0)
351
ping_pong(struct xsk * xsk,void * rx_packet,clockid_t clock_id)352 static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
353 {
354 struct xsk_tx_metadata *meta;
355 struct ipv6hdr *ip6h = NULL;
356 struct iphdr *iph = NULL;
357 struct xdp_desc *tx_desc;
358 struct udphdr *udph;
359 struct ethhdr *eth;
360 __sum16 want_csum;
361 void *data;
362 __u32 idx;
363 int ret;
364 int len;
365
366 ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
367 if (ret != 1) {
368 printf("%p: failed to reserve tx slot\n", xsk);
369 return;
370 }
371
372 tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
373 tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
374 data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
375
376 meta = data - sizeof(struct xsk_tx_metadata);
377 memset(meta, 0, sizeof(*meta));
378 meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
379
380 eth = rx_packet;
381
382 if (eth->h_proto == htons(ETH_P_IP)) {
383 iph = (void *)(eth + 1);
384 udph = (void *)(iph + 1);
385 } else if (eth->h_proto == htons(ETH_P_IPV6)) {
386 ip6h = (void *)(eth + 1);
387 udph = (void *)(ip6h + 1);
388 } else {
389 printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto);
390 xsk_ring_prod__cancel(&xsk->tx, 1);
391 return;
392 }
393
394 len = ETH_HLEN;
395 if (ip6h)
396 len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
397 if (iph)
398 len += ntohs(iph->tot_len);
399
400 swap(eth->h_dest, eth->h_source, ETH_ALEN);
401 if (iph)
402 swap(&iph->saddr, &iph->daddr, 4);
403 else
404 swap(&ip6h->saddr, &ip6h->daddr, 16);
405 swap(&udph->source, &udph->dest, 2);
406
407 want_csum = udph->check;
408 if (ip6h)
409 udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
410 ntohs(udph->len), IPPROTO_UDP, 0);
411 else
412 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
413 ntohs(udph->len), IPPROTO_UDP, 0);
414
415 meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
416 if (iph)
417 meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
418 else
419 meta->request.csum_start = sizeof(*eth) + sizeof(*ip6h);
420 meta->request.csum_offset = offsetof(struct udphdr, check);
421
422 printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
423 xsk, ntohs(udph->check), ntohs(want_csum),
424 meta->request.csum_start, meta->request.csum_offset);
425
426 /* Set the value of launch time */
427 if (launch_time_delta_to_hw_rx_timestamp) {
428 meta->flags |= XDP_TXMD_FLAGS_LAUNCH_TIME;
429 meta->request.launch_time = last_hw_rx_timestamp +
430 launch_time_delta_to_hw_rx_timestamp;
431 last_launch_time = meta->request.launch_time;
432 print_tstamp_delta("HW RX-time", "HW Launch-time",
433 last_hw_rx_timestamp,
434 meta->request.launch_time);
435 }
436
437 memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
438 tx_desc->options |= XDP_TX_METADATA;
439 tx_desc->len = len;
440
441 xsk_ring_prod__submit(&xsk->tx, 1);
442 }
443
/* Main receive loop.
 *
 * Polls the @rxq AF_XDP sockets in @rx_xsk together with the UDP socket
 * @server_fd. Messages on @server_fd go to verify_skb_metadata(). For every
 * packet received on an AF_XDP socket the XDP metadata of its first frame is
 * verified and, unless skip_tx is set, a reply is sent and its completion is
 * awaited. Each consumed frame is handed back to the fill ring.
 *
 * The loop ends when poll() reports an error; the function returns 0 in
 * every case.
 */
static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
{
	const struct xdp_desc *rx_desc;
	struct pollfd fds[rxq + 1];	/* one slot per queue, the last one for server_fd */
	__u64 comp_addr;
	__u64 deadline;
	__u64 addr;
	__u32 idx = 0;
	int ret;
	int i;

	for (i = 0; i < rxq; i++) {
		fds[i].fd = xsk_socket__fd(rx_xsk[i].socket);
		fds[i].events = POLLIN;
		fds[i].revents = 0;
	}

	fds[rxq].fd = server_fd;
	fds[rxq].events = POLLIN;
	fds[rxq].revents = 0;

	while (true) {
		/* reset so that the value printed after poll() is not stale */
		errno = 0;

		for (i = 0; i < rxq; i++) {
			ret = kick_rx(&rx_xsk[i]);
			if (ret)
				printf("kick_rx ret=%d\n", ret);
		}

		/* wake up at least once per second to print the counters */
		ret = poll(fds, rxq + 1, 1000);
		printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
		       ret, errno, bpf_obj->bss->pkts_skip,
		       bpf_obj->bss->pkts_fail, bpf_obj->bss->pkts_redir);
		if (ret < 0)
			break;
		if (ret == 0)
			continue;

		if (fds[rxq].revents)
			verify_skb_metadata(server_fd);

		for (i = 0; i < rxq; i++) {
			bool first_seg = true;
			bool is_eop = true;

			if (fds[i].revents == 0)
				continue;

			struct xsk *xsk = &rx_xsk[i];
peek:
			/* Re-entered through "goto peek" for every further
			 * fragment of a packet that is not finished yet.
			 */
			ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
			printf("xsk_ring_cons__peek: %d\n", ret);
			if (ret != 1)
				continue;

			rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
			comp_addr = xsk_umem__extract_addr(rx_desc->addr);
			addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
			/* XDP_PKT_CONTD set: more fragments of this packet follow */
			is_eop = !(rx_desc->options & XDP_PKT_CONTD);
			printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx%s\n",
			       xsk, idx, rx_desc->addr, addr, comp_addr, is_eop ? " EoP" : "");
			/* metadata check and reply happen once per packet, on its first frame */
			if (first_seg) {
				verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
						    clock_id);
				first_seg = false;

				if (!skip_tx) {
					/* mirror first chunk back */
					ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr),
						  clock_id);

					ret = kick_tx(xsk);
					if (ret)
						printf("kick_tx ret=%d\n", ret);

					/* wait 1 second + cover launch time */
					deadline = gettime(clock_id) +
						   NANOSEC_PER_SEC +
						   launch_time_delta_to_hw_rx_timestamp;
					while (true) {
						if (complete_tx(xsk, clock_id))
							break;
						if (gettime(clock_id) >= deadline)
							break;
						usleep(10);
					}
				}
			}

			/* done with this frame: release it and post it for reception again */
			xsk_ring_cons__release(&xsk->rx, 1);
			refill_rx(xsk, comp_addr);
			if (!is_eop)
				goto peek;
		}
	}

	return 0;
}
543
rxq_num(const char * ifname)544 static int rxq_num(const char *ifname)
545 {
546 struct ethtool_channels ch = {
547 .cmd = ETHTOOL_GCHANNELS,
548 };
549
550 struct ifreq ifr = {
551 .ifr_data = (void *)&ch,
552 };
553 strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
554 int fd, ret;
555
556 fd = socket(AF_UNIX, SOCK_DGRAM, 0);
557 if (fd < 0)
558 error(1, errno, "socket");
559
560 ret = ioctl(fd, SIOCETHTOOL, &ifr);
561 if (ret < 0)
562 error(1, errno, "ioctl(SIOCETHTOOL)");
563
564 close(fd);
565
566 return ch.rx_count + ch.combined_count;
567 }
568
hwtstamp_ioctl(int op,const char * ifname,struct hwtstamp_config * cfg)569 static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *cfg)
570 {
571 struct ifreq ifr = {
572 .ifr_data = (void *)cfg,
573 };
574 strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
575 int fd, ret;
576
577 fd = socket(AF_UNIX, SOCK_DGRAM, 0);
578 if (fd < 0)
579 error(1, errno, "socket");
580
581 ret = ioctl(fd, op, &ifr);
582 if (ret < 0)
583 error(1, errno, "ioctl(%d)", op);
584
585 close(fd);
586 }
587
588 static struct hwtstamp_config saved_hwtstamp_cfg;
589 static const char *saved_hwtstamp_ifname;
590
hwtstamp_restore(void)591 static void hwtstamp_restore(void)
592 {
593 hwtstamp_ioctl(SIOCSHWTSTAMP, saved_hwtstamp_ifname, &saved_hwtstamp_cfg);
594 }
595
hwtstamp_enable(const char * ifname)596 static void hwtstamp_enable(const char *ifname)
597 {
598 struct hwtstamp_config cfg = {
599 .rx_filter = HWTSTAMP_FILTER_ALL,
600 .tx_type = HWTSTAMP_TX_ON,
601 };
602
603 hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg);
604 saved_hwtstamp_ifname = strdup(ifname);
605 atexit(hwtstamp_restore);
606
607 hwtstamp_ioctl(SIOCSHWTSTAMP, ifname, &cfg);
608 }
609
cleanup(void)610 static void cleanup(void)
611 {
612 LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
613 int ret;
614 int i;
615
616 if (bpf_obj) {
617 opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx);
618 if (opts.old_prog_fd >= 0) {
619 printf("detaching bpf program....\n");
620 ret = bpf_xdp_detach(ifindex, XDP_FLAGS, &opts);
621 if (ret)
622 printf("failed to detach XDP program: %d\n", ret);
623 }
624 }
625
626 for (i = 0; i < rxq; i++)
627 close_xsk(&rx_xsk[i]);
628
629 if (bpf_obj)
630 xdp_hw_metadata__destroy(bpf_obj);
631
632 free((void *)saved_hwtstamp_ifname);
633 }
634
/* Signal handler that does nothing on purpose. */
static void handle_signal(int sig)
{
	(void)sig;	/* interrupting poll() is all we need */
}
639
/* Set SO_TIMESTAMPING on @fd to the flag combination @val. A setsockopt()
 * failure terminates the program through error().
 */
static void timestamping_enable(int fd, int val)
{
	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
		error(1, errno, "setsockopt(SO_TIMESTAMPING)");
}
648
/* Write the command line help text to stdout. */
static void print_usage(void)
{
	static const char usage[] =
		"Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n"
		" -c Run in copy mode (zerocopy is default)\n"
		" -h Display this help and exit\n\n"
		" -m Enable multi-buffer XDP for larger MTU\n"
		" -r Don't generate AF_XDP reply (rx metadata only)\n"
		" -l Delta of launch time relative to HW RX-time in ns\n"
		" default: 0 ns (launch time request is disabled)\n"
		" -L Tx Queue to be enabled with launch time offload\n"
		" default: 0 (Tx Queue 0)\n"
		"Generate test packets on the other machine with:\n"
		" echo -n xdp | nc -u -q1 <dst_ip> 9091\n";

	fputs(usage, stdout);
}
666
read_args(int argc,char * argv[])667 static void read_args(int argc, char *argv[])
668 {
669 int opt;
670
671 while ((opt = getopt(argc, argv, "chmrl:L:")) != -1) {
672 switch (opt) {
673 case 'c':
674 bind_flags &= ~XDP_USE_NEED_WAKEUP;
675 bind_flags &= ~XDP_ZEROCOPY;
676 bind_flags |= XDP_COPY;
677 break;
678 case 'h':
679 print_usage();
680 exit(0);
681 case 'm':
682 bind_flags |= XDP_USE_SG;
683 break;
684 case 'r':
685 skip_tx = true;
686 break;
687 case 'l':
688 launch_time_delta_to_hw_rx_timestamp = atoll(optarg);
689 break;
690 case 'L':
691 launch_time_queue = atoll(optarg);
692 break;
693 case '?':
694 if (isprint(optopt))
695 fprintf(stderr, "Unknown option: -%c\n", optopt);
696 fallthrough;
697 default:
698 print_usage();
699 error(-1, opterr, "Command line options error");
700 }
701 }
702
703 if (optind >= argc) {
704 fprintf(stderr, "No device name provided\n");
705 print_usage();
706 exit(-1);
707 }
708
709 ifname = argv[optind];
710 ifindex = if_nametoindex(ifname);
711
712 if (!ifname)
713 error(-1, errno, "Invalid interface name");
714 }
715
clean_existing_configurations(void)716 void clean_existing_configurations(void)
717 {
718 /* Check and delete root qdisc if exists */
719 if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc mqprio 8001:'", ifname) == 0)
720 run_command("sudo tc qdisc del dev %s root", ifname);
721
722 /* Check and delete ingress qdisc if exists */
723 if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc ingress ffff:'", ifname) == 0)
724 run_command("sudo tc qdisc del dev %s ingress", ifname);
725
726 /* Check and delete ethtool filters if any exist */
727 if (run_command("sudo ethtool -n %s | grep -q 'Filter:'", ifname) == 0) {
728 run_command("sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 sudo ethtool -N %s delete >&2",
729 ifname, ifname);
730 }
731 }
732
733 #define MAX_TC 16
734
main(int argc,char * argv[])735 int main(int argc, char *argv[])
736 {
737 clockid_t clock_id = CLOCK_TAI;
738 struct bpf_program *prog;
739 int server_fd = -1;
740 size_t map_len = 0;
741 size_t que_len = 0;
742 char *buf = NULL;
743 char *map = NULL;
744 char *que = NULL;
745 char *tmp = NULL;
746 int tc = 0;
747 int ret;
748 int i;
749
750 read_args(argc, argv);
751
752 rxq = rxq_num(ifname);
753 printf("rxq: %d\n", rxq);
754
755 if (launch_time_queue >= rxq || launch_time_queue < 0)
756 error(1, 0, "Invalid launch_time_queue.");
757
758 clean_existing_configurations();
759 sleep(1);
760
761 /* Enable tx and rx hardware timestamping */
762 hwtstamp_enable(ifname);
763
764 /* Prepare priority to traffic class map for tc-mqprio */
765 for (i = 0; i < MAX_TC; i++) {
766 if (i < rxq)
767 tc = i;
768
769 if (asprintf(&buf, "%d ", tc) == -1) {
770 printf("Failed to malloc buf for tc map.\n");
771 goto free_mem;
772 }
773
774 map_len += strlen(buf);
775 tmp = realloc(map, map_len + 1);
776 if (!tmp) {
777 printf("Failed to realloc tc map.\n");
778 goto free_mem;
779 }
780 map = tmp;
781 strcat(map, buf);
782 free(buf);
783 buf = NULL;
784 }
785
786 /* Prepare traffic class to hardware queue map for tc-mqprio */
787 for (i = 0; i <= tc; i++) {
788 if (asprintf(&buf, "1@%d ", i) == -1) {
789 printf("Failed to malloc buf for tc queues.\n");
790 goto free_mem;
791 }
792
793 que_len += strlen(buf);
794 tmp = realloc(que, que_len + 1);
795 if (!tmp) {
796 printf("Failed to realloc tc queues.\n");
797 goto free_mem;
798 }
799 que = tmp;
800 strcat(que, buf);
801 free(buf);
802 buf = NULL;
803 }
804
805 /* Add mqprio qdisc */
806 run_command("sudo tc qdisc add dev %s handle 8001: parent root mqprio num_tc %d map %squeues %shw 0",
807 ifname, tc + 1, map, que);
808
809 /* To test launch time, send UDP packet with VLAN priority 1 to port 9091 */
810 if (launch_time_delta_to_hw_rx_timestamp) {
811 /* Enable launch time hardware offload on launch_time_queue */
812 run_command("sudo tc qdisc replace dev %s parent 8001:%d etf offload clockid CLOCK_TAI delta 500000",
813 ifname, launch_time_queue + 1);
814 sleep(1);
815
816 /* Route incoming packet with VLAN priority 1 into launch_time_queue */
817 if (run_command("sudo ethtool -N %s flow-type ether vlan 0x2000 vlan-mask 0x1FFF action %d",
818 ifname, launch_time_queue)) {
819 run_command("sudo tc qdisc add dev %s ingress", ifname);
820 run_command("sudo tc filter add dev %s parent ffff: protocol 802.1Q flower vlan_prio 1 hw_tc %d",
821 ifname, launch_time_queue);
822 }
823
824 /* Enable VLAN tag stripping offload */
825 run_command("sudo ethtool -K %s rxvlan on", ifname);
826 }
827
828 rx_xsk = malloc(sizeof(struct xsk) * rxq);
829 if (!rx_xsk)
830 error(1, ENOMEM, "malloc");
831
832 for (i = 0; i < rxq; i++) {
833 printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
834 ret = open_xsk(ifindex, &rx_xsk[i], i);
835 if (ret)
836 error(1, -ret, "open_xsk");
837
838 printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket));
839 }
840
841 printf("open bpf program...\n");
842 bpf_obj = xdp_hw_metadata__open();
843 if (libbpf_get_error(bpf_obj))
844 error(1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
845
846 prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
847 bpf_program__set_ifindex(prog, ifindex);
848 bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
849
850 printf("load bpf program...\n");
851 ret = xdp_hw_metadata__load(bpf_obj);
852 if (ret)
853 error(1, -ret, "xdp_hw_metadata__load");
854
855 printf("prepare skb endpoint...\n");
856 server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000);
857 if (server_fd < 0)
858 error(1, errno, "start_server");
859 timestamping_enable(server_fd,
860 SOF_TIMESTAMPING_SOFTWARE |
861 SOF_TIMESTAMPING_RAW_HARDWARE);
862
863 printf("prepare xsk map...\n");
864 for (i = 0; i < rxq; i++) {
865 int sock_fd = xsk_socket__fd(rx_xsk[i].socket);
866 __u32 queue_id = i;
867
868 printf("map[%d] = %d\n", queue_id, sock_fd);
869 ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
870 if (ret)
871 error(1, -ret, "bpf_map_update_elem");
872 }
873
874 printf("attach bpf program...\n");
875 ret = bpf_xdp_attach(ifindex,
876 bpf_program__fd(bpf_obj->progs.rx),
877 XDP_FLAGS, NULL);
878 if (ret)
879 error(1, -ret, "bpf_xdp_attach");
880
881 signal(SIGINT, handle_signal);
882 ret = verify_metadata(rx_xsk, rxq, server_fd, clock_id);
883 close(server_fd);
884 cleanup();
885 if (ret)
886 error(1, -ret, "verify_metadata");
887
888 clean_existing_configurations();
889
890 free_mem:
891 free(buf);
892 free(map);
893 free(que);
894 }
895