1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ 3 4 #include <argp.h> 5 #include <string.h> 6 #include <arpa/inet.h> 7 #include <linux/if_ether.h> 8 #include <linux/ip.h> 9 #include <linux/ipv6.h> 10 #include <linux/in.h> 11 #include <linux/tcp.h> 12 #include <linux/udp.h> 13 #include "bench.h" 14 #include "bench_bpf_timing.h" 15 #include "xdp_lb_bench.skel.h" 16 #include "xdp_lb_bench_common.h" 17 #include "bpf_util.h" 18 19 #define IP4(a, b, c, d) (((__u32)(a) << 24) | ((__u32)(b) << 16) | ((__u32)(c) << 8) | (__u32)(d)) 20 21 #define IP6(a, b, c, d) { (__u32)(a), (__u32)(b), (__u32)(c), (__u32)(d) } 22 23 #define TNL_DST IP4(192, 168, 1, 2) 24 #define REAL_INDEX 1 25 #define REAL_INDEX_V6 2 26 #define MAX_PKT_SIZE 256 27 #define IP_MF 0x2000 28 29 static const __u32 tnl_dst_v6[4] = { 0xfd000000, 0, 0, 2 }; 30 31 static const __u8 lb_mac[ETH_ALEN] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}; 32 static const __u8 client_mac[ETH_ALEN] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66}; 33 static const __u8 router_mac[ETH_ALEN] = {0xde, 0xad, 0xbe, 0xef, 0x00, 0x01}; 34 35 enum scenario_id { 36 S_TCP_V4_LRU_HIT, 37 S_TCP_V4_CH, 38 S_TCP_V6_LRU_HIT, 39 S_TCP_V6_CH, 40 S_UDP_V4_LRU_HIT, 41 S_UDP_V6_LRU_HIT, 42 S_TCP_V4V6_LRU_HIT, 43 S_TCP_V4_LRU_DIVERSE, 44 S_TCP_V4_CH_DIVERSE, 45 S_TCP_V6_LRU_DIVERSE, 46 S_TCP_V6_CH_DIVERSE, 47 S_UDP_V4_LRU_DIVERSE, 48 S_TCP_V4_LRU_MISS, 49 S_UDP_V4_LRU_MISS, 50 S_TCP_V4_LRU_WARMUP, 51 S_TCP_V4_SYN, 52 S_TCP_V4_RST_MISS, 53 S_PASS_V4_NO_VIP, 54 S_PASS_V6_NO_VIP, 55 S_PASS_V4_ICMP, 56 S_PASS_NON_IP, 57 S_DROP_V4_FRAG, 58 S_DROP_V4_OPTIONS, 59 S_DROP_V6_FRAG, 60 NUM_SCENARIOS, 61 }; 62 63 enum lru_miss_type { 64 LRU_MISS_AUTO = 0, /* compute from scenario flags (default) */ 65 LRU_MISS_NONE, /* 0 misses (all LRU hits) */ 66 LRU_MISS_ALL, /* batch_iters+1 misses (every op misses) */ 67 LRU_MISS_FIRST, /* 1 miss (first miss, then hits) */ 68 }; 69 70 #define S_BASE_ENCAP_V4 \ 71 .expected_retval = XDP_TX, .expect_encap = true, \ 72 .tunnel_dst = TNL_DST 73 74 #define S_BASE_ENCAP_V6 \ 75 .expected_retval = XDP_TX, .expect_encap = true, \ 76 .is_v6 = true, .encap_v6_outer = true, \ 77 .tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } 78 79 #define S_BASE_ENCAP_V4V6 \ 80 .expected_retval = XDP_TX, .expect_encap = true, \ 81 .encap_v6_outer = true, \ 82 .tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } 83 84 struct test_scenario { 85 const char *name; 86 const char *description; 87 int expected_retval; 88 bool expect_encap; 89 bool is_v6; 90 __u32 vip_addr; 91 __u32 src_addr; 92 __u32 tunnel_dst; 93 __u32 vip_addr_v6[4]; 94 __u32 src_addr_v6[4]; 95 __u32 tunnel_dst_v6[4]; 96 __u16 dst_port; 97 __u16 src_port; 98 __u8 ip_proto; 99 __u32 vip_flags; 100 __u32 vip_num; 101 bool prepopulate_lru; 102 bool set_frag; 103 __u16 eth_proto; 104 bool encap_v6_outer; 105 __u32 flow_mask; 106 bool cold_lru; 107 bool set_syn; 108 bool set_rst; 109 bool set_ip_options; 110 __u32 fixed_batch_iters; /* 0 = auto-calibrate, >0 = use this value */ 111 enum lru_miss_type lru_miss; /* expected LRU miss pattern */ 112 }; 113 114 static const struct test_scenario scenarios[NUM_SCENARIOS] = { 115 /* Single-flow baseline */ 116 [S_TCP_V4_LRU_HIT] = { 117 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 118 .name = "tcp-v4-lru-hit", 119 .description = "IPv4 TCP, LRU hit, IPIP encap", 120 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 121 .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, 122 .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, 123 }, 124 [S_TCP_V4_CH] = { 125 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 126 .name = "tcp-v4-ch", 127 .description = "IPv4 TCP, CH (LRU bypass), IPIP encap", 128 .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, 129 .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, 130 .vip_flags = F_LRU_BYPASS, .vip_num = 1, 131 .lru_miss = LRU_MISS_ALL, 132 }, 133 [S_TCP_V6_LRU_HIT] = { 134 S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, 135 .name = "tcp-v6-lru-hit", 136 .description = "IPv6 TCP, LRU hit, IP6IP6 encap", 137 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, 138 .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, 139 .vip_num = 10, 140 .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, 141 }, 142 [S_TCP_V6_CH] = { 143 S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, 144 .name = "tcp-v6-ch", 145 .description = "IPv6 TCP, CH (LRU bypass), IP6IP6 encap", 146 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80, 147 .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, 148 .vip_flags = F_LRU_BYPASS, .vip_num = 12, 149 .lru_miss = LRU_MISS_ALL, 150 }, 151 [S_UDP_V4_LRU_HIT] = { 152 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, 153 .name = "udp-v4-lru-hit", 154 .description = "IPv4 UDP, LRU hit, IPIP encap", 155 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, 156 .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, 157 .vip_num = 2, 158 .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, 159 }, 160 [S_UDP_V6_LRU_HIT] = { 161 S_BASE_ENCAP_V6, .ip_proto = IPPROTO_UDP, 162 .name = "udp-v6-lru-hit", 163 .description = "IPv6 UDP, LRU hit, IP6IP6 encap", 164 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 443, 165 .src_addr_v6 = IP6(0xfd000200, 0, 0, 3), .src_port = 22222, 166 .vip_num = 14, 167 .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, 168 }, 169 [S_TCP_V4V6_LRU_HIT] = { 170 S_BASE_ENCAP_V4V6, .ip_proto = IPPROTO_TCP, 171 .name = "tcp-v4v6-lru-hit", 172 .description = "IPv4 TCP, LRU hit, IPv4-in-IPv6 encap", 173 .vip_addr = IP4(10, 10, 1, 4), .dst_port = 80, 174 .src_addr = IP4(10, 10, 2, 4), .src_port = 12347, 175 .vip_num = 13, 176 .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, 177 }, 178 179 /* Diverse flows (4K src addrs) */ 180 [S_TCP_V4_LRU_DIVERSE] = { 181 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 182 .name = "tcp-v4-lru-diverse", 183 .description = "IPv4 TCP, diverse flows, warm LRU", 184 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 185 .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, 186 .prepopulate_lru = true, .flow_mask = 0xFFF, 187 .lru_miss = LRU_MISS_NONE, 188 }, 189 [S_TCP_V4_CH_DIVERSE] = { 190 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 191 .name = "tcp-v4-ch-diverse", 192 .description = "IPv4 TCP, diverse flows, CH (LRU bypass)", 193 .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, 194 .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, 195 .vip_flags = F_LRU_BYPASS, .vip_num = 1, 196 .flow_mask = 0xFFF, .lru_miss = LRU_MISS_ALL, 197 }, 198 [S_TCP_V6_LRU_DIVERSE] = { 199 S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, 200 .name = "tcp-v6-lru-diverse", 201 .description = "IPv6 TCP, diverse flows, warm LRU", 202 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, 203 .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, 204 .vip_num = 10, 205 .prepopulate_lru = true, .flow_mask = 0xFFF, 206 .lru_miss = LRU_MISS_NONE, 207 }, 208 [S_TCP_V6_CH_DIVERSE] = { 209 S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, 210 .name = "tcp-v6-ch-diverse", 211 .description = "IPv6 TCP, diverse flows, CH (LRU bypass)", 212 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80, 213 .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, 214 .vip_flags = F_LRU_BYPASS, .vip_num = 12, 215 .flow_mask = 0xFFF, .lru_miss = LRU_MISS_ALL, 216 }, 217 [S_UDP_V4_LRU_DIVERSE] = { 218 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, 219 .name = "udp-v4-lru-diverse", 220 .description = "IPv4 UDP, diverse flows, warm LRU", 221 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, 222 .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, 223 .vip_num = 2, 224 .prepopulate_lru = true, .flow_mask = 0xFFF, 225 .lru_miss = LRU_MISS_NONE, 226 }, 227 228 /* LRU stress */ 229 [S_TCP_V4_LRU_MISS] = { 230 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 231 .name = "tcp-v4-lru-miss", 232 .description = "IPv4 TCP, LRU miss (16M flow space), CH lookup", 233 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 234 .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, 235 .flow_mask = 0xFFFFFF, .cold_lru = true, 236 .lru_miss = LRU_MISS_FIRST, 237 }, 238 [S_UDP_V4_LRU_MISS] = { 239 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, 240 .name = "udp-v4-lru-miss", 241 .description = "IPv4 UDP, LRU miss (16M flow space), CH lookup", 242 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, 243 .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, 244 .vip_num = 2, 245 .flow_mask = 0xFFFFFF, .cold_lru = true, 246 .lru_miss = LRU_MISS_FIRST, 247 }, 248 [S_TCP_V4_LRU_WARMUP] = { 249 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 250 .name = "tcp-v4-lru-warmup", 251 .description = "IPv4 TCP, 4K flows, ~50% LRU miss", 252 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 253 .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, 254 .flow_mask = 0xFFF, .cold_lru = true, 255 .fixed_batch_iters = 6500, 256 .lru_miss = LRU_MISS_FIRST, 257 }, 258 259 /* TCP flags */ 260 [S_TCP_V4_SYN] = { 261 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 262 .name = "tcp-v4-syn", 263 .description = "IPv4 TCP SYN, skip LRU, CH + LRU insert", 264 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 265 .src_addr = IP4(10, 10, 8, 2), .src_port = 60001, 266 .set_syn = true, .lru_miss = LRU_MISS_ALL, 267 }, 268 [S_TCP_V4_RST_MISS] = { 269 S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, 270 .name = "tcp-v4-rst-miss", 271 .description = "IPv4 TCP RST, CH lookup, no LRU insert", 272 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 273 .src_addr = IP4(10, 10, 8, 1), .src_port = 60000, 274 .flow_mask = 0xFFFFFF, .cold_lru = true, 275 .set_rst = true, .lru_miss = LRU_MISS_ALL, 276 }, 277 278 /* Early exits */ 279 [S_PASS_V4_NO_VIP] = { 280 .name = "pass-v4-no-vip", 281 .description = "IPv4 TCP, unknown VIP, XDP_PASS", 282 .expected_retval = XDP_PASS, 283 .ip_proto = IPPROTO_TCP, 284 .vip_addr = IP4(10, 10, 9, 9), .dst_port = 80, 285 .src_addr = IP4(10, 10, 4, 1), .src_port = 33333, 286 }, 287 [S_PASS_V6_NO_VIP] = { 288 .name = "pass-v6-no-vip", 289 .description = "IPv6 TCP, unknown VIP, XDP_PASS", 290 .expected_retval = XDP_PASS, .is_v6 = true, 291 .ip_proto = IPPROTO_TCP, 292 .vip_addr_v6 = IP6(0xfd009900, 0, 0, 1), .dst_port = 80, 293 .src_addr_v6 = IP6(0xfd000400, 0, 0, 1), .src_port = 33333, 294 }, 295 [S_PASS_V4_ICMP] = { 296 .name = "pass-v4-icmp", 297 .description = "IPv4 ICMP, non-TCP/UDP protocol, XDP_PASS", 298 .expected_retval = XDP_PASS, 299 .ip_proto = IPPROTO_ICMP, 300 .vip_addr = IP4(10, 10, 1, 1), 301 .src_addr = IP4(10, 10, 6, 1), 302 }, 303 [S_PASS_NON_IP] = { 304 .name = "pass-non-ip", 305 .description = "Non-IP (ARP), earliest XDP_PASS exit", 306 .expected_retval = XDP_PASS, 307 .eth_proto = ETH_P_ARP, 308 }, 309 [S_DROP_V4_FRAG] = { 310 .name = "drop-v4-frag", 311 .description = "IPv4 fragmented, XDP_DROP", 312 .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, 313 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 314 .src_addr = IP4(10, 10, 5, 1), .src_port = 44444, 315 .set_frag = true, 316 }, 317 [S_DROP_V4_OPTIONS] = { 318 .name = "drop-v4-options", 319 .description = "IPv4 with IP options (ihl>5), XDP_DROP", 320 .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, 321 .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, 322 .src_addr = IP4(10, 10, 7, 1), .src_port = 55555, 323 .set_ip_options = true, 324 }, 325 [S_DROP_V6_FRAG] = { 326 .name = "drop-v6-frag", 327 .description = "IPv6 fragment extension header, XDP_DROP", 328 .expected_retval = XDP_DROP, .is_v6 = true, 329 .ip_proto = IPPROTO_TCP, 330 .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, 331 .src_addr_v6 = IP6(0xfd000500, 0, 0, 1), .src_port = 44444, 332 .set_frag = true, 333 }, 334 }; 335 336 #define MAX_ENCAP_SIZE (MAX_PKT_SIZE + sizeof(struct ipv6hdr)) 337 338 static __u8 pkt_buf[NUM_SCENARIOS][MAX_PKT_SIZE]; 339 static __u32 pkt_len[NUM_SCENARIOS]; 340 static __u8 expected_buf[NUM_SCENARIOS][MAX_ENCAP_SIZE]; 341 static __u32 expected_len[NUM_SCENARIOS]; 342 343 static int lru_inner_fds[BENCH_NR_CPUS]; 344 static int nr_inner_maps; 345 346 static struct ctx { 347 struct xdp_lb_bench *skel; 348 struct bpf_bench_timing timing; 349 int prog_fd; 350 } ctx; 351 352 static struct { 353 int scenario; 354 bool machine_readable; 355 } args = { 356 .scenario = -1, 357 }; 358 359 static __u16 ip_checksum(const void *hdr, int len) 360 { 361 const __u16 *p = hdr; 362 __u32 csum = 0; 363 int i; 364 365 for (i = 0; i < len / 2; i++) 366 csum += p[i]; 367 368 while (csum >> 16) 369 csum = (csum & 0xffff) + (csum >> 16); 370 371 return ~csum; 372 } 373 374 static void htonl_v6(__be32 dst[4], const __u32 src[4]) 375 { 376 int i; 377 378 for (i = 0; i < 4; i++) 379 dst[i] = htonl(src[i]); 380 } 381 382 static void build_flow_key(struct flow_key *fk, const struct test_scenario *sc) 383 { 384 memset(fk, 0, sizeof(*fk)); 385 if (sc->is_v6) { 386 htonl_v6(fk->srcv6, sc->src_addr_v6); 387 htonl_v6(fk->dstv6, sc->vip_addr_v6); 388 } else { 389 fk->src = htonl(sc->src_addr); 390 fk->dst = htonl(sc->vip_addr); 391 } 392 fk->proto = sc->ip_proto; 393 fk->port16[0] = htons(sc->src_port); 394 fk->port16[1] = htons(sc->dst_port); 395 } 396 397 static void build_l4(const struct test_scenario *sc, __u8 *p, __u32 *off) 398 { 399 if (sc->ip_proto == IPPROTO_TCP) { 400 struct tcphdr tcp = {}; 401 402 tcp.source = htons(sc->src_port); 403 tcp.dest = htons(sc->dst_port); 404 tcp.doff = 5; 405 tcp.syn = sc->set_syn ? 1 : 0; 406 tcp.rst = sc->set_rst ? 1 : 0; 407 tcp.window = htons(8192); 408 memcpy(p + *off, &tcp, sizeof(tcp)); 409 *off += sizeof(tcp); 410 } else if (sc->ip_proto == IPPROTO_UDP) { 411 struct udphdr udp = {}; 412 413 udp.source = htons(sc->src_port); 414 udp.dest = htons(sc->dst_port); 415 udp.len = htons(sizeof(udp) + 16); 416 memcpy(p + *off, &udp, sizeof(udp)); 417 *off += sizeof(udp); 418 } 419 } 420 421 static void build_packet(int idx) 422 { 423 const struct test_scenario *sc = &scenarios[idx]; 424 __u8 *p = pkt_buf[idx]; 425 struct ethhdr eth = {}; 426 __u16 proto; 427 __u32 off = 0; 428 429 memcpy(eth.h_dest, lb_mac, ETH_ALEN); 430 memcpy(eth.h_source, client_mac, ETH_ALEN); 431 432 if (sc->eth_proto) 433 proto = sc->eth_proto; 434 else if (sc->is_v6) 435 proto = ETH_P_IPV6; 436 else 437 proto = ETH_P_IP; 438 439 eth.h_proto = htons(proto); 440 memcpy(p, ð, sizeof(eth)); 441 off += sizeof(eth); 442 443 if (proto != ETH_P_IP && proto != ETH_P_IPV6) { 444 memcpy(p + off, "bench___payload!", 16); 445 off += 16; 446 pkt_len[idx] = off; 447 return; 448 } 449 450 if (sc->is_v6) { 451 struct ipv6hdr ip6h = {}; 452 __u32 ip6_off = off; 453 454 ip6h.version = 6; 455 ip6h.nexthdr = sc->set_frag ? 44 : sc->ip_proto; 456 ip6h.hop_limit = 64; 457 htonl_v6((__be32 *)&ip6h.saddr, sc->src_addr_v6); 458 htonl_v6((__be32 *)&ip6h.daddr, sc->vip_addr_v6); 459 off += sizeof(ip6h); 460 461 if (sc->set_frag) { 462 memset(p + off, 0, 8); 463 p[off] = sc->ip_proto; 464 off += 8; 465 } 466 467 build_l4(sc, p, &off); 468 469 memcpy(p + off, "bench___payload!", 16); 470 off += 16; 471 472 ip6h.payload_len = htons(off - ip6_off - sizeof(ip6h)); 473 memcpy(p + ip6_off, &ip6h, sizeof(ip6h)); 474 } else { 475 struct iphdr iph = {}; 476 __u32 ip_off = off; 477 478 iph.version = 4; 479 iph.ihl = sc->set_ip_options ? 6 : 5; 480 iph.ttl = 64; 481 iph.protocol = sc->ip_proto; 482 iph.saddr = htonl(sc->src_addr); 483 iph.daddr = htonl(sc->vip_addr); 484 iph.frag_off = sc->set_frag ? htons(IP_MF) : 0; 485 off += sizeof(iph); 486 487 if (sc->set_ip_options) { 488 /* NOP option padding (4 bytes = 1 word) */ 489 __u32 nop = htonl(0x01010101); 490 491 memcpy(p + off, &nop, sizeof(nop)); 492 off += sizeof(nop); 493 } 494 495 build_l4(sc, p, &off); 496 497 memcpy(p + off, "bench___payload!", 16); 498 off += 16; 499 500 iph.tot_len = htons(off - ip_off); 501 iph.check = ip_checksum(&iph, sizeof(iph)); 502 memcpy(p + ip_off, &iph, sizeof(iph)); 503 } 504 505 pkt_len[idx] = off; 506 } 507 508 static void populate_vip(struct xdp_lb_bench *skel, const struct test_scenario *sc) 509 { 510 struct vip_definition key = {}; 511 struct vip_meta val = {}; 512 int err; 513 514 if (sc->is_v6) 515 htonl_v6(key.vipv6, sc->vip_addr_v6); 516 else 517 key.vip = htonl(sc->vip_addr); 518 key.port = htons(sc->dst_port); 519 key.proto = sc->ip_proto; 520 val.flags = sc->vip_flags; 521 val.vip_num = sc->vip_num; 522 523 err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &val, BPF_ANY); 524 if (err) { 525 fprintf(stderr, "vip_map [%s]: %s\n", sc->name, strerror(errno)); 526 exit(1); 527 } 528 } 529 530 static void create_per_cpu_lru_maps(struct xdp_lb_bench *skel) 531 { 532 int outer_fd = bpf_map__fd(skel->maps.lru_mapping); 533 unsigned int nr_cpus = bpf_num_possible_cpus(); 534 int i, inner_fd, err; 535 __u32 cpu; 536 537 if (nr_cpus > BENCH_NR_CPUS) 538 nr_cpus = BENCH_NR_CPUS; 539 540 for (i = 0; i < (int)nr_cpus; i++) { 541 LIBBPF_OPTS(bpf_map_create_opts, opts); 542 543 inner_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_inner", 544 sizeof(struct flow_key), 545 sizeof(struct real_pos_lru), 546 DEFAULT_LRU_SIZE, &opts); 547 if (inner_fd < 0) { 548 fprintf(stderr, "lru_inner[%d]: %s\n", i, strerror(errno)); 549 exit(1); 550 } 551 552 cpu = i; 553 err = bpf_map_update_elem(outer_fd, &cpu, &inner_fd, BPF_ANY); 554 if (err) { 555 fprintf(stderr, "lru_mapping[%d]: %s\n", i, strerror(errno)); 556 close(inner_fd); 557 exit(1); 558 } 559 560 lru_inner_fds[i] = inner_fd; 561 } 562 563 nr_inner_maps = nr_cpus; 564 } 565 566 static __u64 ktime_get_ns(void) 567 { 568 struct timespec ts; 569 570 clock_gettime(CLOCK_MONOTONIC, &ts); 571 return (__u64)ts.tv_sec * 1000000000ULL + ts.tv_nsec; 572 } 573 574 static void populate_lru(const struct test_scenario *sc, __u32 real_idx) 575 { 576 struct real_pos_lru lru = { .pos = real_idx }; 577 struct flow_key fk; 578 int i, err; 579 580 if (sc->ip_proto == IPPROTO_UDP) 581 lru.atime = ktime_get_ns(); 582 583 build_flow_key(&fk, sc); 584 585 /* Insert into every per-CPU inner LRU so the entry is found 586 * regardless of which CPU runs the BPF program. 587 */ 588 for (i = 0; i < nr_inner_maps; i++) { 589 err = bpf_map_update_elem(lru_inner_fds[i], &fk, &lru, BPF_ANY); 590 if (err) { 591 fprintf(stderr, "lru_inner[%d] [%s]: %s\n", i, sc->name, 592 strerror(errno)); 593 exit(1); 594 } 595 } 596 } 597 598 static void populate_maps(struct xdp_lb_bench *skel) 599 { 600 struct real_definition real_v4 = {}; 601 struct real_definition real_v6 = {}; 602 struct ctl_value cval = {}; 603 __u32 key, real_idx = REAL_INDEX; 604 int ch_fd, err, i; 605 606 if (scenarios[args.scenario].expect_encap) 607 populate_vip(skel, &scenarios[args.scenario]); 608 609 ch_fd = bpf_map__fd(skel->maps.ch_rings); 610 for (i = 0; i < CH_RINGS_SIZE; i++) { 611 __u32 k = i; 612 613 err = bpf_map_update_elem(ch_fd, &k, &real_idx, BPF_ANY); 614 if (err) { 615 fprintf(stderr, "ch_rings[%d]: %s\n", i, strerror(errno)); 616 exit(1); 617 } 618 } 619 620 memcpy(cval.mac, router_mac, ETH_ALEN); 621 key = 0; 622 err = bpf_map_update_elem(bpf_map__fd(skel->maps.ctl_array), &key, &cval, BPF_ANY); 623 if (err) { 624 fprintf(stderr, "ctl_array: %s\n", strerror(errno)); 625 exit(1); 626 } 627 628 key = REAL_INDEX; 629 real_v4.dst = htonl(TNL_DST); 630 htonl_v6(real_v4.dstv6, tnl_dst_v6); 631 err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v4, BPF_ANY); 632 if (err) { 633 fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX, strerror(errno)); 634 exit(1); 635 } 636 637 key = REAL_INDEX_V6; 638 htonl_v6(real_v6.dstv6, tnl_dst_v6); 639 real_v6.flags = F_IPV6; 640 err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v6, BPF_ANY); 641 if (err) { 642 fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX_V6, strerror(errno)); 643 exit(1); 644 } 645 646 create_per_cpu_lru_maps(skel); 647 648 if (scenarios[args.scenario].prepopulate_lru) { 649 const struct test_scenario *sc = &scenarios[args.scenario]; 650 __u32 ridx = sc->encap_v6_outer ? REAL_INDEX_V6 : REAL_INDEX; 651 652 populate_lru(sc, ridx); 653 } 654 655 if (scenarios[args.scenario].expect_encap) { 656 const struct test_scenario *sc = &scenarios[args.scenario]; 657 struct vip_definition miss_vip = {}; 658 659 if (sc->is_v6) 660 htonl_v6(miss_vip.vipv6, sc->vip_addr_v6); 661 else 662 miss_vip.vip = htonl(sc->vip_addr); 663 miss_vip.port = htons(sc->dst_port); 664 miss_vip.proto = sc->ip_proto; 665 666 key = 0; 667 err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_miss_stats), 668 &key, &miss_vip, BPF_ANY); 669 if (err) { 670 fprintf(stderr, "vip_miss_stats: %s\n", strerror(errno)); 671 exit(1); 672 } 673 } 674 } 675 676 static void build_expected_packet(int idx) 677 { 678 const struct test_scenario *sc = &scenarios[idx]; 679 __u8 *p = expected_buf[idx]; 680 struct ethhdr eth = {}; 681 const __u8 *in = pkt_buf[idx]; 682 __u32 in_len = pkt_len[idx]; 683 __u32 off = 0; 684 __u32 inner_len = in_len - sizeof(struct ethhdr); 685 686 if (sc->expected_retval == XDP_DROP) { 687 expected_len[idx] = 0; 688 return; 689 } 690 691 if (sc->expected_retval == XDP_PASS) { 692 memcpy(p, in, in_len); 693 expected_len[idx] = in_len; 694 return; 695 } 696 697 memcpy(eth.h_dest, router_mac, ETH_ALEN); 698 memcpy(eth.h_source, lb_mac, ETH_ALEN); 699 eth.h_proto = htons(sc->encap_v6_outer ? ETH_P_IPV6 : ETH_P_IP); 700 memcpy(p, ð, sizeof(eth)); 701 off += sizeof(eth); 702 703 if (sc->encap_v6_outer) { 704 struct ipv6hdr ip6h = {}; 705 __u8 nexthdr = sc->is_v6 ? IPPROTO_IPV6 : IPPROTO_IPIP; 706 707 ip6h.version = 6; 708 ip6h.nexthdr = nexthdr; 709 ip6h.payload_len = htons(inner_len); 710 ip6h.hop_limit = 64; 711 712 create_encap_ipv6_src(htons(sc->src_port), 713 sc->is_v6 ? htonl(sc->src_addr_v6[0]) 714 : htonl(sc->src_addr), 715 (__be32 *)&ip6h.saddr); 716 htonl_v6((__be32 *)&ip6h.daddr, sc->tunnel_dst_v6); 717 718 memcpy(p + off, &ip6h, sizeof(ip6h)); 719 off += sizeof(ip6h); 720 } else { 721 struct iphdr iph = {}; 722 723 iph.version = 4; 724 iph.ihl = sizeof(iph) >> 2; 725 iph.protocol = IPPROTO_IPIP; 726 iph.tot_len = htons(inner_len + sizeof(iph)); 727 iph.ttl = 64; 728 iph.saddr = create_encap_ipv4_src(htons(sc->src_port), 729 htonl(sc->src_addr)); 730 iph.daddr = htonl(sc->tunnel_dst); 731 iph.check = ip_checksum(&iph, sizeof(iph)); 732 733 memcpy(p + off, &iph, sizeof(iph)); 734 off += sizeof(iph); 735 } 736 737 memcpy(p + off, in + sizeof(struct ethhdr), inner_len); 738 off += inner_len; 739 740 expected_len[idx] = off; 741 } 742 743 static void print_hex_diff(const char *name, const __u8 *got, __u32 got_len, const __u8 *exp, 744 __u32 exp_len) 745 { 746 __u32 max_len = got_len > exp_len ? got_len : exp_len; 747 __u32 i, ndiffs = 0; 748 749 fprintf(stderr, " [%s] got %u bytes, expected %u bytes\n", 750 name, got_len, exp_len); 751 752 for (i = 0; i < max_len && ndiffs < 8; i++) { 753 __u8 g = i < got_len ? got[i] : 0; 754 __u8 e = i < exp_len ? exp[i] : 0; 755 756 if (g != e || i >= got_len || i >= exp_len) { 757 fprintf(stderr, " offset 0x%03x: got 0x%02x expected 0x%02x\n", 758 i, g, e); 759 ndiffs++; 760 } 761 } 762 763 if (ndiffs >= 8 && i < max_len) 764 fprintf(stderr, " ... (more differences)\n"); 765 } 766 767 static void read_stat(int stats_fd, __u32 key, __u64 *v1_out, __u64 *v2_out) 768 { 769 struct lb_stats values[BENCH_NR_CPUS]; 770 unsigned int nr_cpus = bpf_num_possible_cpus(); 771 __u64 v1 = 0, v2 = 0; 772 unsigned int i; 773 774 if (nr_cpus > BENCH_NR_CPUS) 775 nr_cpus = BENCH_NR_CPUS; 776 777 if (bpf_map_lookup_elem(stats_fd, &key, values) == 0) { 778 for (i = 0; i < nr_cpus; i++) { 779 v1 += values[i].v1; 780 v2 += values[i].v2; 781 } 782 } 783 784 *v1_out = v1; 785 *v2_out = v2; 786 } 787 788 static void reset_stats(int stats_fd) 789 { 790 struct lb_stats zeros[BENCH_NR_CPUS]; 791 __u32 key; 792 793 memset(zeros, 0, sizeof(zeros)); 794 for (key = 0; key < STATS_SIZE; key++) 795 bpf_map_update_elem(stats_fd, &key, zeros, BPF_ANY); 796 } 797 798 static bool validate_counters(int idx) 799 { 800 const struct test_scenario *sc = &scenarios[idx]; 801 int stats_fd = bpf_map__fd(ctx.skel->maps.stats); 802 __u64 xdp_tx, xdp_pass, xdp_drop, lru_pkts, lru_misses, tcp_misses; 803 __u64 expected_misses; 804 __u64 dummy; 805 /* 806 * BENCH_BPF_LOOP runs batch_iters timed + 1 untimed iteration. 807 * Each iteration calls process_packet -> count_action, so all 808 * counters are incremented (batch_iters + 1) times. 809 */ 810 __u64 n = ctx.timing.batch_iters + 1; 811 bool pass = true; 812 813 read_stat(stats_fd, STATS_XDP_TX, &xdp_tx, &dummy); 814 read_stat(stats_fd, STATS_XDP_PASS, &xdp_pass, &dummy); 815 read_stat(stats_fd, STATS_XDP_DROP, &xdp_drop, &dummy); 816 read_stat(stats_fd, STATS_LRU, &lru_pkts, &lru_misses); 817 read_stat(stats_fd, STATS_LRU_MISS, &tcp_misses, &dummy); 818 819 if (sc->expected_retval == XDP_TX && xdp_tx != n) { 820 fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_TX=%llu, expected %llu\n", sc->name, 821 (unsigned long long)xdp_tx, (unsigned long long)n); 822 pass = false; 823 } 824 if (sc->expected_retval == XDP_PASS && xdp_pass != n) { 825 fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_PASS=%llu, expected %llu\n", 826 sc->name, (unsigned long long)xdp_pass, (unsigned long long)n); 827 pass = false; 828 } 829 if (sc->expected_retval == XDP_DROP && xdp_drop != n) { 830 fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_DROP=%llu, expected %llu\n", 831 sc->name, (unsigned long long)xdp_drop, (unsigned long long)n); 832 pass = false; 833 } 834 835 if (!sc->expect_encap) 836 goto out; 837 838 if (lru_pkts != n) { 839 fprintf(stderr, " [%s] COUNTER FAIL: STATS_LRU.v1=%llu, expected %llu\n", 840 sc->name, (unsigned long long)lru_pkts, (unsigned long long)n); 841 pass = false; 842 } 843 844 switch (sc->lru_miss) { 845 case LRU_MISS_NONE: 846 expected_misses = 0; 847 break; 848 case LRU_MISS_ALL: 849 expected_misses = n; 850 break; 851 case LRU_MISS_FIRST: 852 expected_misses = 1; 853 break; 854 default: 855 /* LRU_MISS_AUTO: compute from scenario flags */ 856 if (sc->prepopulate_lru && !sc->set_syn) 857 expected_misses = 0; 858 else if (sc->set_syn || sc->set_rst || 859 (sc->vip_flags & F_LRU_BYPASS)) 860 expected_misses = n; 861 else if (sc->cold_lru) 862 expected_misses = 1; 863 else 864 expected_misses = n; 865 break; 866 } 867 868 if (lru_misses != expected_misses) { 869 fprintf(stderr, " [%s] COUNTER FAIL: LRU misses=%llu, expected %llu\n", 870 sc->name, (unsigned long long)lru_misses, 871 (unsigned long long)expected_misses); 872 pass = false; 873 } 874 875 if (sc->ip_proto == IPPROTO_TCP && lru_misses > 0) { 876 if (tcp_misses != lru_misses) { 877 fprintf(stderr, " [%s] COUNTER FAIL: TCP LRU misses=%llu, expected %llu\n", 878 sc->name, (unsigned long long)tcp_misses, 879 (unsigned long long)lru_misses); 880 pass = false; 881 } 882 } 883 884 out: 885 reset_stats(stats_fd); 886 return pass; 887 } 888 889 static const char *xdp_action_str(int action) 890 { 891 switch (action) { 892 case XDP_DROP: return "XDP_DROP"; 893 case XDP_PASS: return "XDP_PASS"; 894 case XDP_TX: return "XDP_TX"; 895 default: return "UNKNOWN"; 896 } 897 } 898 899 static bool validate_scenario(int idx) 900 { 901 LIBBPF_OPTS(bpf_test_run_opts, topts); 902 const struct test_scenario *sc = &scenarios[idx]; 903 __u8 out[MAX_ENCAP_SIZE]; 904 int err; 905 906 topts.data_in = pkt_buf[idx]; 907 topts.data_size_in = pkt_len[idx]; 908 topts.data_out = out; 909 topts.data_size_out = sizeof(out); 910 topts.repeat = 1; 911 912 err = bpf_prog_test_run_opts(ctx.prog_fd, &topts); 913 if (err) { 914 fprintf(stderr, " [%s] FAIL: test_run: %s\n", sc->name, strerror(errno)); 915 return false; 916 } 917 918 if ((int)topts.retval != sc->expected_retval) { 919 fprintf(stderr, " [%s] FAIL: retval %s, expected %s\n", sc->name, 920 xdp_action_str(topts.retval), xdp_action_str(sc->expected_retval)); 921 return false; 922 } 923 924 /* 925 * Compare output packet when it's deterministic. 926 * Skip for XDP_DROP (no output) and cold_lru (source IP poisoned). 927 */ 928 if (sc->expected_retval != XDP_DROP && !sc->cold_lru) { 929 if (topts.data_size_out != expected_len[idx] || 930 memcmp(out, expected_buf[idx], expected_len[idx]) != 0) { 931 fprintf(stderr, " [%s] FAIL: output packet mismatch\n", sc->name); 932 print_hex_diff(sc->name, out, topts.data_size_out, expected_buf[idx], 933 expected_len[idx]); 934 return false; 935 } 936 } 937 938 if (!validate_counters(idx)) 939 return false; 940 return true; 941 } 942 943 static int find_scenario(const char *name) 944 { 945 int i; 946 947 for (i = 0; i < NUM_SCENARIOS; i++) { 948 if (strcmp(scenarios[i].name, name) == 0) 949 return i; 950 } 951 return -1; 952 } 953 954 static void xdp_lb_validate(void) 955 { 956 if (env.consumer_cnt != 0) { 957 fprintf(stderr, "benchmark doesn't support consumers\n"); 958 exit(1); 959 } 960 if (bpf_num_possible_cpus() > BENCH_NR_CPUS) { 961 fprintf(stderr, "too many CPUs (%d > %d), increase BENCH_NR_CPUS\n", 962 bpf_num_possible_cpus(), BENCH_NR_CPUS); 963 exit(1); 964 } 965 } 966 967 static void xdp_lb_run_once(void *unused __always_unused) 968 { 969 int idx = args.scenario; 970 971 LIBBPF_OPTS(bpf_test_run_opts, topts, 972 .data_in = pkt_buf[idx], 973 .data_size_in = pkt_len[idx], 974 .repeat = 1, 975 ); 976 977 bpf_prog_test_run_opts(ctx.prog_fd, &topts); 978 } 979 980 static void xdp_lb_setup(void) 981 { 982 struct xdp_lb_bench *skel; 983 int err; 984 985 if (args.scenario < 0) { 986 fprintf(stderr, "--scenario is required. Use --list-scenarios to see options.\n"); 987 exit(1); 988 } 989 990 setup_libbpf(); 991 992 skel = xdp_lb_bench__open(); 993 if (!skel) { 994 fprintf(stderr, "failed to open skeleton\n"); 995 exit(1); 996 } 997 998 err = xdp_lb_bench__load(skel); 999 if (err) { 1000 fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err)); 1001 xdp_lb_bench__destroy(skel); 1002 exit(1); 1003 } 1004 1005 ctx.skel = skel; 1006 ctx.prog_fd = bpf_program__fd(skel->progs.xdp_lb_bench); 1007 1008 build_packet(args.scenario); 1009 build_expected_packet(args.scenario); 1010 1011 populate_maps(skel); 1012 1013 BENCH_TIMING_INIT(&ctx.timing, skel, 0); 1014 ctx.timing.machine_readable = args.machine_readable; 1015 1016 if (scenarios[args.scenario].fixed_batch_iters) { 1017 ctx.timing.batch_iters = scenarios[args.scenario].fixed_batch_iters; 1018 skel->bss->batch_iters = ctx.timing.batch_iters; 1019 } else { 1020 bpf_bench_calibrate(&ctx.timing, xdp_lb_run_once, NULL); 1021 } 1022 1023 env.duration_sec = 600; 1024 1025 /* 1026 * Enable cold_lru before validation so LRU miss counters are 1027 * correct. Seed the LRU with one run so the original flow is 1028 * present; validation then sees exactly 1 miss (the poisoned 1029 * flow) regardless of whether calibration ran. 1030 */ 1031 if (scenarios[args.scenario].cold_lru) { 1032 skel->bss->cold_lru = 1; 1033 xdp_lb_run_once(NULL); 1034 } 1035 1036 reset_stats(bpf_map__fd(skel->maps.stats)); 1037 1038 if (!validate_scenario(args.scenario)) { 1039 fprintf(stderr, "Validation FAILED - aborting benchmark\n"); 1040 exit(1); 1041 } 1042 1043 if (scenarios[args.scenario].flow_mask) 1044 skel->bss->flow_mask = scenarios[args.scenario].flow_mask; 1045 } 1046 1047 static void *xdp_lb_producer(void *input) 1048 { 1049 while (true) 1050 xdp_lb_run_once(NULL); 1051 1052 return NULL; 1053 } 1054 1055 static void xdp_lb_measure(struct bench_res *res) 1056 { 1057 bpf_bench_timing_measure(&ctx.timing, res); 1058 } 1059 1060 static void xdp_lb_report_final(struct bench_res res[], int res_cnt) 1061 { 1062 bpf_bench_timing_report(&ctx.timing, scenarios[args.scenario].name, 1063 scenarios[args.scenario].description); 1064 } 1065 1066 enum { 1067 ARG_SCENARIO = 9001, 1068 ARG_LIST_SCENARIOS = 9002, 1069 ARG_MACHINE_READABLE = 9003, 1070 }; 1071 1072 static const struct argp_option opts[] = { 1073 { "scenario", ARG_SCENARIO, "NAME", 0, 1074 "Scenario to benchmark (required)" }, 1075 { "list-scenarios", ARG_LIST_SCENARIOS, NULL, 0, 1076 "List available scenarios and exit" }, 1077 { "machine-readable", ARG_MACHINE_READABLE, NULL, 0, 1078 "Print only a machine-readable RESULT line" }, 1079 {}, 1080 }; 1081 1082 static error_t parse_arg(int key, char *arg, struct argp_state *state) 1083 { 1084 int i; 1085 1086 switch (key) { 1087 case ARG_SCENARIO: 1088 args.scenario = find_scenario(arg); 1089 if (args.scenario < 0) { 1090 fprintf(stderr, "unknown scenario: '%s'\n", arg); 1091 fprintf(stderr, "use --list-scenarios to see options\n"); 1092 argp_usage(state); 1093 } 1094 break; 1095 case ARG_LIST_SCENARIOS: 1096 printf("Available scenarios:\n"); 1097 for (i = 0; i < NUM_SCENARIOS; i++) 1098 printf(" %-20s %s\n", scenarios[i].name, scenarios[i].description); 1099 exit(0); 1100 case ARG_MACHINE_READABLE: 1101 args.machine_readable = true; 1102 env.quiet = true; 1103 break; 1104 default: 1105 return ARGP_ERR_UNKNOWN; 1106 } 1107 1108 return 0; 1109 } 1110 1111 const struct argp bench_xdp_lb_argp = { 1112 .options = opts, 1113 .parser = parse_arg, 1114 }; 1115 1116 const struct bench bench_xdp_lb = { 1117 .name = "xdp-lb", 1118 .argp = &bench_xdp_lb_argp, 1119 .validate = xdp_lb_validate, 1120 .setup = xdp_lb_setup, 1121 .producer_thread = xdp_lb_producer, 1122 .measure = xdp_lb_measure, 1123 .report_final = xdp_lb_report_final, 1124 }; 1125