// SPDX-License-Identifier: GPL-2.0
/*
 * Check if we can migrate child sockets.
 *
 * 1. call listen() for 4 server sockets.
 * 2. call connect() for 25 client sockets.
 * 3. call listen() for 1 server socket. (migration target)
 * 4. update a map to migrate all child sockets
 *      to the last server socket (migrate_map[cookie] = 4)
 * 5. for TCP_ESTABLISHED and TCP_SYN_RECV cases, verify via epoll
 *      that the last server socket is not ready before migration.
 * 6. call shutdown() for first 4 server sockets
 *      and migrate the requests in the accept queue
 *      to the last server socket.
 * 7. for TCP_ESTABLISHED and TCP_SYN_RECV cases, verify via epoll
 *      that the last server socket is ready after migration.
 * 8. call listen() for the second server socket.
 * 9. call shutdown() for the last server
 *      and migrate the requests in the accept queue
 *      to the second server socket.
 * 10. call listen() for the last server.
 * 11. call shutdown() for the second server
 *      and migrate the requests in the accept queue
 *      to the last server socket.
 * 12. call accept() for the last server socket.
 *
 * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
 */

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <sys/epoll.h>

#include "test_progs.h"
#include "test_migrate_reuseport.skel.h"
#include "network_helpers.h"

/* Fallback for old UAPI headers that predate TCP_FASTOPEN_CONNECT */
#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30
#endif

/* Loopback is always ifindex 1; the XDP ACK-dropper is attached there */
#define IFINDEX_LO 1

#define NR_SERVERS 5
#define NR_CLIENTS (NR_SERVERS * 5)
#define MIGRATED_TO (NR_SERVERS - 1)

/* fastopenq->max_qlen and sk->sk_max_ack_backlog */
#define QLEN (NR_CLIENTS * 5)

#define MSG "Hello World\0"
#define MSGLEN 12

/*
 * One entry per subtest: address family x child-socket state x how the
 * migration is triggered (listener close, reqsk timer, or 3WHS completion).
 */
static struct migrate_reuseport_test_case {
	const char *name;
	/* __s64 (not int) so the fd values can be stored in BPF maps as u64 */
	__s64 servers[NR_SERVERS];
	__s64 clients[NR_CLIENTS];
	struct sockaddr_storage addr;
	socklen_t addrlen;
	int family;
	int state;
	/* attach the XDP prog to drop the final ACK of the 3WHS */
	bool drop_ack;
	/* let the SYN+ACK retransmit timer fire to trigger migration */
	bool expire_synack_timer;
	/* use TCP Fast Open so requests sit in TCP_SYN_RECV */
	bool fastopen;
	struct bpf_link *link;
} test_cases[] = {
	{
		.name = "IPv4 TCP_ESTABLISHED inet_csk_listen_stop",
		.family = AF_INET,
		.state = BPF_TCP_ESTABLISHED,
		.drop_ack = false,
		.expire_synack_timer = false,
		.fastopen = false,
	},
	{
		.name = "IPv4 TCP_SYN_RECV inet_csk_listen_stop",
		.family = AF_INET,
		.state = BPF_TCP_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = false,
		.fastopen = true,
	},
	{
		.name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler",
		.family = AF_INET,
		.state = BPF_TCP_NEW_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = true,
		.fastopen = false,
	},
	{
		.name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
		.family = AF_INET,
		.state = BPF_TCP_NEW_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = false,
		.fastopen = false,
	},
	{
		.name = "IPv6 TCP_ESTABLISHED inet_csk_listen_stop",
		.family = AF_INET6,
		.state = BPF_TCP_ESTABLISHED,
		.drop_ack = false,
		.expire_synack_timer = false,
		.fastopen = false,
	},
	{
		.name = "IPv6 TCP_SYN_RECV inet_csk_listen_stop",
		.family = AF_INET6,
		.state = BPF_TCP_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = false,
		.fastopen = true,
	},
	{
		.name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler",
		.family = AF_INET6,
		.state = BPF_TCP_NEW_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = true,
		.fastopen = false,
	},
	{
		.name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
		.family = AF_INET6,
		.state = BPF_TCP_NEW_SYN_RECV,
		.drop_ack = true,
		.expire_synack_timer = false,
		.fastopen = false,
	}
};

/* Mark all fds as unused so close_fds() can tell which ones were opened. */
static void init_fds(__s64 fds[], int len)
{
	int i;

	for (i = 0; i < len; i++)
		fds[i] = -1;
}

/* Close every fd that was actually opened and reset the slot to -1. */
static void close_fds(__s64 fds[], int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (fds[i] != -1) {
			close(fds[i]);
			fds[i] = -1;
		}
	}
}

/*
 * Set up (restore == false) or restore (restore == true) the
 * net.ipv4.tcp_fastopen sysctl.  On setup, the current value is read into
 * buf/saved_len so the caller can hand it back for restoration later.
 * Returns 0 on success, -1 on failure.
 */
static int setup_fastopen(char *buf, int size, int *saved_len, bool restore)
{
	int err = 0, fd, len;

	fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR);
	if (!ASSERT_NEQ(fd, -1, "open"))
		return -1;

	if (restore) {
		len = write(fd, buf, *saved_len);
		if (!ASSERT_EQ(len, *saved_len, "write - restore"))
			err = -1;
	} else {
		*saved_len = read(fd, buf, size);
		if (!ASSERT_GE(*saved_len, 1, "read")) {
			err = -1;
			goto close;
		}

		/* rewind so the new value overwrites the old one */
		err = lseek(fd, 0, SEEK_SET);
		if (!ASSERT_OK(err, "lseek"))
			goto close;

		/* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE |
		 * TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD)
		 */
		len = write(fd, "519", 3);
		if (!ASSERT_EQ(len, 3, "write - setup"))
			err = -1;
	}

close:
	close(fd);

	return err;
}

/*
 * Attach the XDP program that drops the final ACK of the 3-way handshake
 * to loopback, after telling it which (network-byte-order) port to watch.
 * The link is stored in test_case->link for later pass_ack().
 */
static int drop_ack(struct migrate_reuseport_test_case *test_case,
		    struct test_migrate_reuseport *skel)
{
	if (test_case->family == AF_INET)
		skel->bss->server_port = ((struct sockaddr_in *)
					  &test_case->addr)->sin_port;
	else
		skel->bss->server_port = ((struct sockaddr_in6 *)
					  &test_case->addr)->sin6_port;

	test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack,
						  IFINDEX_LO);
	if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp"))
		return -1;

	return 0;
}

/* Detach the ACK-dropping XDP program so the 3WHS can complete. */
static int pass_ack(struct migrate_reuseport_test_case *test_case)
{
	int err;

	err = bpf_link__destroy(test_case->link);
	if (!ASSERT_OK(err, "bpf_link__destroy"))
		return -1;

	test_case->link = NULL;

	return 0;
}

/*
 * Create NR_SERVERS reuseport listeners on an ephemeral loopback port.
 * The first socket gets the SO_ATTACH_REUSEPORT_EBPF selector program and
 * determines the port (read back via getsockname); all but the migration
 * target (MIGRATED_TO) are put into the listening state.
 */
static int start_servers(struct migrate_reuseport_test_case *test_case,
			 struct test_migrate_reuseport *skel)
{
	int i, err, prog_fd, reuseport = 1, qlen = QLEN;

	prog_fd = bpf_program__fd(skel->progs.migrate_reuseport);

	/* port 0: let the kernel pick; the real port is fetched below */
	make_sockaddr(test_case->family,
		      test_case->family == AF_INET ? "127.0.0.1" : "::1", 0,
		      &test_case->addr, &test_case->addrlen);

	for (i = 0; i < NR_SERVERS; i++) {
		test_case->servers[i] = socket(test_case->family, SOCK_STREAM,
					       IPPROTO_TCP);
		if (!ASSERT_NEQ(test_case->servers[i], -1, "socket"))
			return -1;

		err = setsockopt(test_case->servers[i], SOL_SOCKET,
				 SO_REUSEPORT, &reuseport, sizeof(reuseport));
		if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT"))
			return -1;

		err = bind(test_case->servers[i],
			   (struct sockaddr *)&test_case->addr,
			   test_case->addrlen);
		if (!ASSERT_OK(err, "bind"))
			return -1;

		if (i == 0) {
			err = setsockopt(test_case->servers[i], SOL_SOCKET,
					 SO_ATTACH_REUSEPORT_EBPF,
					 &prog_fd, sizeof(prog_fd));
			if (!ASSERT_OK(err,
				       "setsockopt - SO_ATTACH_REUSEPORT_EBPF"))
				return -1;

			/* learn the kernel-chosen port for later bind()s */
			err = getsockname(test_case->servers[i],
					  (struct sockaddr *)&test_case->addr,
					  &test_case->addrlen);
			if (!ASSERT_OK(err, "getsockname"))
				return -1;
		}

		if (test_case->fastopen) {
			err = setsockopt(test_case->servers[i],
					 SOL_TCP, TCP_FASTOPEN,
					 &qlen, sizeof(qlen));
			if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN"))
				return -1;
		}

		/* All requests will be tied to the first four listeners */
		if (i != MIGRATED_TO) {
			err = listen(test_case->servers[i], qlen);
			if (!ASSERT_OK(err, "listen"))
				return -1;
		}
	}

	return 0;
}

/*
 * Create NR_CLIENTS client sockets, connect them to the server address,
 * and send one message each so the fastopen cases carry data with SYN.
 */
static int start_clients(struct migrate_reuseport_test_case *test_case)
{
	char buf[MSGLEN] = MSG;
	int i, err;

	for (i = 0; i < NR_CLIENTS; i++) {
		test_case->clients[i] = socket(test_case->family, SOCK_STREAM,
					       IPPROTO_TCP);
		if (!ASSERT_NEQ(test_case->clients[i], -1, "socket"))
			return -1;

		/* The attached XDP program drops only the final ACK, so
		 * clients will transition to TCP_ESTABLISHED immediately.
		 */
		err = settimeo(test_case->clients[i], 100);
		if (!ASSERT_OK(err, "settimeo"))
			return -1;

		if (test_case->fastopen) {
			int fastopen = 1;

			err = setsockopt(test_case->clients[i], IPPROTO_TCP,
					 TCP_FASTOPEN_CONNECT, &fastopen,
					 sizeof(fastopen));
			if (!ASSERT_OK(err,
				       "setsockopt - TCP_FASTOPEN_CONNECT"))
				return -1;
		}

		err = connect(test_case->clients[i],
			      (struct sockaddr *)&test_case->addr,
			      test_case->addrlen);
		if (!ASSERT_OK(err, "connect"))
			return -1;

		err = write(test_case->clients[i], buf, MSGLEN);
		if (!ASSERT_EQ(err, MSGLEN, "write"))
			return -1;
	}

	return 0;
}

/*
 * Register every listener in reuseport_map[i] and map each listener's
 * socket cookie (looked up back out of the sockmap) to MIGRATED_TO in
 * migrate_map, so the BPF prog redirects all children to the last server.
 */
static int update_maps(struct migrate_reuseport_test_case *test_case,
		       struct test_migrate_reuseport *skel)
{
	int i, err, migrated_to = MIGRATED_TO;
	int reuseport_map_fd, migrate_map_fd;
	__u64 value;

	reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map);
	migrate_map_fd = bpf_map__fd(skel->maps.migrate_map);

	for (i = 0; i < NR_SERVERS; i++) {
		value = (__u64)test_case->servers[i];
		err = bpf_map_update_elem(reuseport_map_fd, &i, &value,
					  BPF_NOEXIST);
		if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map"))
			return -1;

		/* the lookup returns the socket cookie, not the fd */
		err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
		if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map"))
			return -1;

		err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to,
					  BPF_NOEXIST);
		if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map"))
			return -1;
	}

	return 0;
}

/*
 * Shut down the first four listeners so their accept-queue requests are
 * migrated to the last listener, then bounce the queue between the second
 * and the last listener (steps 5-11 of the header comment).  epoll is used
 * to verify the target only becomes readable after the migration; it is
 * skipped for TCP_NEW_SYN_RECV, whose requests are migrated later.
 * Returns 0 on success, -1 on failure.
 */
static int migrate_dance(struct migrate_reuseport_test_case *test_case)
{
	struct epoll_event ev = {
		.events = EPOLLIN,
	};
	int epoll = -1, nfds;
	int i, err;

	if (test_case->state != BPF_TCP_NEW_SYN_RECV) {
		epoll = epoll_create1(0);
		if (!ASSERT_NEQ(epoll, -1, "epoll_create1"))
			return -1;

		ev.data.fd = test_case->servers[MIGRATED_TO];
		if (!ASSERT_OK(epoll_ctl(epoll, EPOLL_CTL_ADD,
					 test_case->servers[MIGRATED_TO], &ev),
			       "epoll_ctl"))
			goto close_epoll;

		/* target must NOT be readable before migration */
		nfds = epoll_wait(epoll, &ev, 1, 0);
		if (!ASSERT_EQ(nfds, 0, "epoll_wait 1"))
			goto close_epoll;
	}

	/* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests
	 * to the last listener based on eBPF.
	 */
	for (i = 0; i < MIGRATED_TO; i++) {
		err = shutdown(test_case->servers[i], SHUT_RDWR);
		if (!ASSERT_OK(err, "shutdown"))
			goto close_epoll;
	}

	/* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */
	if (test_case->state == BPF_TCP_NEW_SYN_RECV)
		return 0;

	/* target must be readable now that the queue was migrated */
	nfds = epoll_wait(epoll, &ev, 1, 0);
	if (!ASSERT_EQ(nfds, 1, "epoll_wait 2")) {
close_epoll:
		if (epoll >= 0)
			close(epoll);
		return -1;
	}

	close(epoll);

	/* Note that we use the second listener instead of the
	 * first one here.
	 *
	 * The first listener is bind()ed with port 0 and,
	 * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so
	 * calling listen() again will bind() the first listener
	 * on a new ephemeral port and detach it from the existing
	 * reuseport group.  (See: __inet_bind(), tcp_set_state())
	 *
	 * OTOH, the second one is bind()ed with a specific port,
	 * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will
	 * resurrect the listener on the existing reuseport group.
	 */
	err = listen(test_case->servers[1], QLEN);
	if (!ASSERT_OK(err, "listen"))
		return -1;

	/* Migrate from the last listener to the second one.
	 *
	 * All listeners were detached out of the reuseport_map,
	 * so migration will be done by kernel random pick from here.
	 */
	err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR);
	if (!ASSERT_OK(err, "shutdown"))
		return -1;

	/* Back to the existing reuseport group */
	err = listen(test_case->servers[MIGRATED_TO], QLEN);
	if (!ASSERT_OK(err, "listen"))
		return -1;

	/* Migrate back to the last one from the second one */
	err = shutdown(test_case->servers[1], SHUT_RDWR);
	if (!ASSERT_OK(err, "shutdown"))
		return -1;

	return 0;
}

/*
 * Accept all NR_CLIENTS connections on the migration target and verify
 * each carried the expected message, then cross-check the per-path
 * migration counters maintained by the BPF program.
 */
static void count_requests(struct migrate_reuseport_test_case *test_case,
			   struct test_migrate_reuseport *skel)
{
	struct sockaddr_storage addr;
	socklen_t len = sizeof(addr);
	int err, cnt = 0, client;
	char buf[MSGLEN];

	err = settimeo(test_case->servers[MIGRATED_TO], 4000);
	if (!ASSERT_OK(err, "settimeo"))
		goto out;

	for (; cnt < NR_CLIENTS; cnt++) {
		client = accept(test_case->servers[MIGRATED_TO],
				(struct sockaddr *)&addr, &len);
		if (!ASSERT_NEQ(client, -1, "accept"))
			goto out;

		/* read() return is not checked: buf is zeroed, so a short
		 * or failed read makes the STREQ check below fail.
		 */
		memset(buf, 0, MSGLEN);
		read(client, &buf, MSGLEN);
		close(client);

		if (!ASSERT_STREQ(buf, MSG, "read"))
			goto out;
	}

out:
	ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace");

	switch (test_case->state) {
	case BPF_TCP_ESTABLISHED:
		cnt = skel->bss->migrated_at_close;
		break;
	case BPF_TCP_SYN_RECV:
		cnt = skel->bss->migrated_at_close_fastopen;
		break;
	case BPF_TCP_NEW_SYN_RECV:
		if (test_case->expire_synack_timer)
			cnt = skel->bss->migrated_at_send_synack;
		else
			cnt = skel->bss->migrated_at_recv_ack;
		break;
	default:
		cnt = 0;
	}

	ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog");
}

/*
 * Drive one test case end to end: reset BPF counters, optionally enable
 * TFO, start servers/clients, perform the migration dance, and verify the
 * results; clean up fds, the XDP link, and the TFO sysctl on every path.
 */
static void run_test(struct migrate_reuseport_test_case *test_case,
		     struct test_migrate_reuseport *skel)
{
	int err, saved_len;
	char buf[16];

	skel->bss->migrated_at_close = 0;
	skel->bss->migrated_at_close_fastopen = 0;
	skel->bss->migrated_at_send_synack = 0;
	skel->bss->migrated_at_recv_ack = 0;

	init_fds(test_case->servers, NR_SERVERS);
	init_fds(test_case->clients, NR_CLIENTS);

	if (test_case->fastopen) {
		memset(buf, 0, sizeof(buf));

		err = setup_fastopen(buf, sizeof(buf), &saved_len, false);
		if (!ASSERT_OK(err, "setup_fastopen - setup"))
			return;
	}

	err = start_servers(test_case, skel);
	if (!ASSERT_OK(err, "start_servers"))
		goto close_servers;

	if (test_case->drop_ack) {
		/* Drop the final ACK of the 3-way handshake and stick the
		 * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV.
		 */
		err = drop_ack(test_case, skel);
		if (!ASSERT_OK(err, "drop_ack"))
			goto close_servers;
	}

	/* Tie requests to the first four listeners */
	err = start_clients(test_case);
	if (!ASSERT_OK(err, "start_clients"))
		goto close_clients;

	err = listen(test_case->servers[MIGRATED_TO], QLEN);
	if (!ASSERT_OK(err, "listen"))
		goto close_clients;

	err = update_maps(test_case, skel);
	if (!ASSERT_OK(err, "fill_maps"))
		goto close_clients;

	/* Migrate the requests in the accept queue only.
	 * TCP_NEW_SYN_RECV requests are not migrated at this point.
	 */
	err = migrate_dance(test_case);
	if (!ASSERT_OK(err, "migrate_dance"))
		goto close_clients;

	if (test_case->expire_synack_timer) {
		/* Wait for SYN+ACK timers to expire so that
		 * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests.
		 */
		sleep(1);
	}

	if (test_case->link) {
		/* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */
		err = pass_ack(test_case);
		if (!ASSERT_OK(err, "pass_ack"))
			goto close_clients;
	}

	count_requests(test_case, skel);

close_clients:
	close_fds(test_case->clients, NR_CLIENTS);

	/* link may still be attached if an error path was taken above */
	if (test_case->link) {
		err = pass_ack(test_case);
		ASSERT_OK(err, "pass_ack - clean up");
	}

close_servers:
	close_fds(test_case->servers, NR_SERVERS);

	if (test_case->fastopen) {
		err = setup_fastopen(buf, sizeof(buf), &saved_len, true);
		ASSERT_OK(err, "setup_fastopen - restore");
	}
}

/*
 * Test entry point: load the skeleton once and run every test case as a
 * subtest.  Serial ("serial_test_" prefix) because the XDP program on
 * loopback and the tcp_fastopen sysctl are global state.
 */
void serial_test_migrate_reuseport(void)
{
	struct test_migrate_reuseport *skel;
	int i;

	skel = test_migrate_reuseport__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_and_load"))
		return;

	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
		test__start_subtest(test_cases[i].name);
		run_test(&test_cases[i], skel);
	}

	test_migrate_reuseport__destroy(skel);
}