// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <asm/errno.h>

#include "bpf_compiler.h"

/* Return codes used by the "tc" entry point (syncookie_tc). */
#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

#define NSEC_PER_SEC 1000000000L

#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* Access the 32-bit word of the TCP header that holds doff, the flag bits and
 * the window (word index 3 of union tcp_word_hdr).
 */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Bits of iphdr.frag_off, in host byte order. */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

#define NEXTHDR_TCP 6

/* TCP option kinds handled by the option parser and the option writer. */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* Expected total lengths (kind + length + payload) of the options above. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Layout of the timestamp cookie built in tscookie_init(): the low TSBITS bits
 * of the millisecond clock are replaced by wscale (bits 0-3), a SACK flag
 * (bit 4) and an ECN flag (bit 5).
 */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE 14U

/* Maximum header sizes, in bytes, that the bounds checks below reserve. */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used by values_get_tcpipopts() when values[0] is absent or zero. */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8

#define MAX_PACKET_OFF 0xffff

/* Exchange two lvalues of the same type through a temporary. */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* Read a value of the given type through a packed-struct pointer, so that the
 * load does not assume natural alignment of ptr.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
	__pptr->x;								\
})

#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))

/* Two-slot configuration/statistics array:
 *   values[0] - packed SYNACK parameters, decoded by values_get_tcpipopts():
 *               bits 0-15 IPv4 MSS, bits 16-19 window scale, bits 24-31 TTL,
 *               bits 32-47 IPv6 MSS. A value of 0 selects the DEFAULT_*
 *               constants instead.
 *   values[1] - counter incremented by values_inc_synacks() for each SYNACK
 *               generated.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 2);
} values SEC(".maps");

/* List of destination ports (host byte order, as compared in
 * syncookie_handle_syn()) for which SYN cookies are issued. The list is
 * scanned from index 0 and is terminated by the first entry equal to 0.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u16);
	__uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");

/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
 */

struct bpf_ct_opts___local {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u8 reserved[2];
} __attribute__((preserve_access_index));

#define BPF_F_CURRENT_NETNS (-1)

extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 __u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 __u32 len_opts) __ksym;

extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 u32 len_opts) __ksym;

extern void bpf_ct_release(struct nf_conn *ct) __ksym;

/* Exchange two ETH_ALEN-byte MAC addresses in place. */
static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
{
	__u8 tmp[ETH_ALEN];

	__builtin_memcpy(tmp, a, ETH_ALEN);
	__builtin_memcpy(a, b, ETH_ALEN);
	__builtin_memcpy(b, tmp, ETH_ALEN);
}

/* Fold a 32-bit partial checksum into 16 bits and return its complement.
 * The fold is applied twice so that a carry produced by the first addition is
 * absorbed by the second.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (__u16)~csum;
}

/* Finish a TCP/UDP checksum over IPv4: add the pseudo-header terms (source and
 * destination address, protocol and length) to the partial sum @csum, then fold
 * the result to 16 bits.
 *
 * @saddr, @daddr: addresses as stored in the IPv4 header.
 * @len:           L4 length in bytes (host byte order).
 * @proto:         L4 protocol number.
 * @csum:          partial sum over the L4 header/payload.
 *
 * Callers in this file compare the result with 0 to validate a received
 * segment, or store it in tcphdr.check after zeroing that field.
 */
static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
					       __u32 len, __u8 proto,
					       __u32 csum)
{
	/* 64-bit accumulator so that carries out of bit 31 are not lost. */
	__u64 s = csum;

	s += (__u32)saddr;
	s += (__u32)daddr;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	s += proto + len;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	s += (proto + len) << 8;
#else
#error Unknown endian
#endif
	/* Fold the 64-bit sum back into 32 bits (twice, to absorb the carry). */
	s = (s & 0xffffffff) + (s >> 32);
	s = (s & 0xffffffff) + (s >> 32);

	return csum_fold((__u32)s);
}

/* IPv6 counterpart of csum_tcpudp_magic(): add both 128-bit addresses (as four
 * 32-bit words each), the length and the protocol to @csum and fold to 16 bits.
 */
static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
					     const struct in6_addr *daddr,
					     __u32 len, __u8 proto, __u32 csum)
{
	__u64 sum = csum;
	int i;

	__pragma_loop_unroll
	for (i = 0; i < 4; i++)
		sum += (__u32)saddr->in6_u.u6_addr32[i];

	__pragma_loop_unroll
	for (i = 0; i < 4; i++)
		sum += (__u32)daddr->in6_u.u6_addr32[i];

	/* Don't combine additions to avoid 32-bit overflow. */
	sum += bpf_htonl(len);
	sum += bpf_htonl(proto);

	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffffffff) + (sum >> 32);

	return csum_fold((__u32)sum);
}

/* Current time in nanoseconds, as returned by bpf_ktime_get_ns(). */
static __always_inline __u64 tcp_clock_ns(void)
{
	return bpf_ktime_get_ns();
}

/* Convert nanoseconds to timestamp ticks of 1 / TCP_TS_HZ seconds (1 ms with
 * the constants above). The result is truncated to 32 bits.
 */
static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
{
	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
}

/* Current time in timestamp ticks (milliseconds). */
static __always_inline __u32 tcp_clock_ms(void)
{
	return tcp_ns_to_ts(tcp_clock_ns());
}

/* State shared between tscookie_init() and the bpf_loop() option parser.
 *
 * @data, @data_end:  packet bounds used for the per-access range check.
 * @tsecr:            where the client's timestamp value is stored.
 * @wscale:           window scale seen so far; starts as TS_OPT_WSCALE_MASK.
 * @option_timestamp: set when a well-formed timestamp option was found.
 * @option_sack:      set when a well-formed SACK-permitted option was found.
 * @off:              offset from @data of the next option byte to read.
 */
struct tcpopt_context {
	void *data;
	void *data_end;
	__be32 *tsecr;
	__u8 wscale;
	bool option_timestamp;
	bool option_sack;
	__u32 off;
};

/* Return a pointer to the @sz bytes at the current offset and advance the
 * offset by @sz, or return NULL (leaving the offset untouched) when the range
 * check fails.
 *
 * NOTE(review): the packet-end comparison uses ">=", so a read that would end
 * exactly at data_end is rejected as well.
 */
static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
{
	__u64 off = ctx->off;
	__u8 *data;

	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
	if (off > MAX_PACKET_OFF - sz)
		return NULL;

	data = ctx->data + off;
	barrier_var(data);
	if (data + sz >= ctx->data_end)
		return NULL;

	ctx->off += sz;
	return data;
}

/* Parse a single TCP option at ctx->off and record window scale, timestamp and
 * SACK-permitted options in @ctx.
 *
 * Returns 0 when parsing should continue with the next option, and 1 when it
 * must stop (end-of-list option, declared option length below 2, or a failed
 * bounds check).
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
	__u8 *opcode, *opsize, *wscale, *tsecr;
	/* Offset of the option's kind byte, used to skip the whole option. */
	__u32 off = ctx->off;

	opcode = next(ctx, 1);
	if (!opcode)
		return 1;

	if (*opcode == TCPOPT_EOL)
		return 1;
	/* NOP is a single byte; next() has already stepped over it. */
	if (*opcode == TCPOPT_NOP)
		return 0;

	opsize = next(ctx, 1);
	if (!opsize || *opsize < 2)
		return 1;

	switch (*opcode) {
	case TCPOPT_WINDOW:
		wscale = next(ctx, 1);
		if (!wscale)
			return 1;
		/* Clamp the advertised shift count to TCP_MAX_WSCALE. */
		if (*opsize == TCPOLEN_WINDOW)
			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
		break;
	case TCPOPT_TIMESTAMP:
		/* The first four payload bytes are the sender's timestamp value. */
		tsecr = next(ctx, 4);
		if (!tsecr)
			return 1;
		if (*opsize == TCPOLEN_TIMESTAMP) {
			ctx->option_timestamp = true;
			/* Client's tsval becomes our tsecr. */
			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
		}
		break;
	case TCPOPT_SACK_PERM:
		if (*opsize == TCPOLEN_SACK_PERM)
			ctx->option_sack = true;
		break;
	}

	/* Advance by the declared option length, regardless of how many bytes
	 * were actually consumed above.
	 */
	ctx->off = off + *opsize;

	return 0;
}

/* bpf_loop() callback: parse up to 7 options per invocation. @index is not
 * used. Returns 1 as soon as the single-option parser asks to stop, otherwise
 * 0.
 */
static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
{
	int i;

	for (i = 0; i < 7; i++)
		if (tscookie_tcpopt_parse(context))
			return 1;
	return 0;
}

/* Scan the TCP options of a SYN and build the timestamp option for the SYNACK.
 *
 * @tcp_header: TCP header of the received SYN.
 * @tcp_len:    TCP header length; NOTE(review): not referenced in this body,
 *              so the scan is bounded only by the packet bounds and the loop
 *              count, not by the header length.
 * @tsval:      output, the encoded cookie in network byte order.
 * @tsecr:      output, the client's timestamp value (written by the parser).
 * @data, @data_end: packet bounds.
 *
 * Returns true when the SYN carried a timestamp option and *tsval was written,
 * false otherwise.
 */
static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
					  __u16 tcp_len, __be32 *tsval,
					  __be32 *tsecr, void *data, void *data_end)
{
	struct tcpopt_context loop_ctx = {
		.data = data,
		.data_end = data_end,
		.tsecr = tsecr,
		/* 0xf marks "no window scale option seen": parsed values are
		 * clamped to TCP_MAX_WSCALE, and tcp_mkoptions() omits the
		 * window scale option when the low four cookie bits are 0xf.
		 */
		.wscale = TS_OPT_WSCALE_MASK,
		.option_timestamp = false,
		.option_sack = false,
		/* Note: currently the verifier tracks .off as an unbounded
		 *       scalar. If the verifier at some point gets smarter and
		 *       computes a bounded value for this variable, beware that
		 *       it might hinder bpf_loop() convergence validation.
		 */
		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
	};
	u32 cookie;

	/* 6 batches of up to 7 options each. */
	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);

	if (!loop_ctx.option_timestamp)
		return false;

	/* Millisecond clock with the low TSBITS bits replaced by the flags. */
	cookie = tcp_clock_ms() & ~TSMASK;
	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
	if (loop_ctx.option_sack)
		cookie |= TS_OPT_SACK;
	if (tcp_header->ece && tcp_header->cwr)
		cookie |= TS_OPT_ECN;
	*tsval = bpf_htonl(cookie);

	return true;
}

/* Fetch the MSS, window scale and TTL to advertise in the SYNACK.
 *
 * Values are decoded from values[0] when that entry exists and is non-zero
 * (bits 0-15 IPv4 MSS or bits 32-47 IPv6 MSS, bits 16-19 window scale,
 * bits 24-31 TTL); otherwise the DEFAULT_* constants are used.
 */
static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
						 __u8 *ttl, bool ipv6)
{
	__u32 key = 0;
	__u64 *value;

	value = bpf_map_lookup_elem(&values, &key);
	if (value && *value != 0) {
		if (ipv6)
			*mss = (*value >> 32) & 0xffff;
		else
			*mss = *value & 0xffff;
		*wscale = (*value >> 16) & 0xf;
		*ttl = (*value >> 24) & 0xff;
		return;
	}

	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
	*wscale = DEFAULT_WSCALE;
	*ttl = DEFAULT_TTL;
}

/* Atomically increment the SYNACK counter stored in values[1]. */
static __always_inline void values_inc_synacks(void)
{
	__u32 key = 1;
	__u64 *value;

	value = bpf_map_lookup_elem(&values, &key);
	if (value)
		__sync_fetch_and_add(value, 1);
}

/* Return true when @port appears in allowed_ports before the first zero
 * entry, false otherwise.
 */
static __always_inline bool check_port_allowed(__u16 port)
{
	__u32 i;

	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
		__u32 key = i;
		__u16 *value;

		value = bpf_map_lookup_elem(&allowed_ports, &key);

		if (!value)
			break;
		/* 0 is a terminator value. Check it first to avoid matching on
		 * a forbidden port == 0 and returning true.
		 */
		if (*value == 0)
			break;

		if (*value == port)
			return true;
	}

	return false;
}

/* Pointers into the packet filled by tcp_dissect(). Exactly one of @ipv4 and
 * @ipv6 is non-NULL after a successful dissect; @tcp_len is the TCP header
 * length in bytes (doff * 4).
 */
struct header_pointers {
	struct ethhdr *eth;
	struct iphdr *ipv4;
	struct ipv6hdr *ipv6;
	struct tcphdr *tcp;
	__u16 tcp_len;
};

/* Locate the Ethernet, IP and TCP headers of a packet.
 *
 * Returns XDP_TX when @hdr was filled and processing should continue,
 * XDP_PASS for traffic this program does not handle (non-IP EtherType,
 * non-TCP protocol) and XDP_DROP for truncated or malformed headers.
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
				       struct header_pointers *hdr)
{
	hdr->eth = data;
	if (hdr->eth + 1 > data_end)
		return XDP_DROP;

	switch (bpf_ntohs(hdr->eth->h_proto)) {
	case ETH_P_IP:
		hdr->ipv6 = NULL;

		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv4 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
			return XDP_DROP;
		if (hdr->ipv4->version != 4)
			return XDP_DROP;

		if (hdr->ipv4->protocol != IPPROTO_TCP)
			return XDP_PASS;

		/* ihl accounts for IPv4 options, if any. */
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
		break;
	case ETH_P_IPV6:
		hdr->ipv4 = NULL;

		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv6 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv6->version != 6)
			return XDP_DROP;

		/* XXX: Extension headers are not supported and could circumvent
		 * XDP SYN flood protection.
		 */
		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
		break;
	default:
		/* XXX: VLANs will circumvent XDP SYN flood protection. */
		return XDP_PASS;
	}

	if (hdr->tcp + 1 > data_end)
		return XDP_DROP;
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_DROP;

	return XDP_TX;
}

/* Look the packet's 4-tuple up in conntrack.
 *
 * @ctx: struct xdp_md * when @xdp is true, struct __sk_buff * otherwise.
 *
 * Returns XDP_PASS when a confirmed conntrack entry exists, XDP_TX when there
 * is no entry or the entry is not confirmed (processing continues), XDP_DROP
 * for IPv4 packets whose frag_off is anything other than "DF set, MF clear,
 * offset 0", and XDP_ABORTED when the lookup fails with an error other than
 * -ENOENT or when neither IP header pointer is set.
 */
static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
{
	struct bpf_ct_opts___local ct_lookup_opts = {
		.netns_id = BPF_F_CURRENT_NETNS,
		.l4proto = IPPROTO_TCP,
	};
	struct bpf_sock_tuple tup = {};
	struct nf_conn *ct;
	__u32 tup_size;

	if (hdr->ipv4) {
		/* TCP doesn't normally use fragments, and XDP can't reassemble
		 * them.
		 */
		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
			return XDP_DROP;

		tup.ipv4.saddr = hdr->ipv4->saddr;
		tup.ipv4.daddr = hdr->ipv4->daddr;
		tup.ipv4.sport = hdr->tcp->source;
		tup.ipv4.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv4);
	} else if (hdr->ipv6) {
		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
		tup.ipv6.sport = hdr->tcp->source;
		tup.ipv6.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv6);
	} else {
		/* The verifier can't track that either ipv4 or ipv6 is not
		 * NULL.
		 */
		return XDP_ABORTED;
	}
	if (xdp)
		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
	else
		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
	if (ct) {
		/* Copy the status out before dropping the reference. */
		unsigned long status = ct->status;

		bpf_ct_release(ct);
		if (status & IPS_CONFIRMED)
			return XDP_PASS;
	} else if (ct_lookup_opts.error != -ENOENT) {
		return XDP_ABORTED;
	}

	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
	return XDP_TX;
}

/* Write the SYNACK's TCP options into @buf as 32-bit words.
 *
 * @buf:    destination, directly after the fixed TCP header.
 * @tsopt:  NULL, or a two-word array {tsval, tsecr} in network byte order
 *          whose first word carries the cookie flags from tscookie_init().
 * @mss:    MSS value to advertise.
 * @wscale: window scale to advertise.
 *
 * Returns the number of 32-bit words written: 1 (MSS only), 4 (MSS plus
 * timestamp) or 5 (MSS, timestamp and window scale).
 */
static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
					  __u8 wscale)
{
	__be32 *start = buf;

	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);

	if (!tsopt)
		return buf - start;

	/* Bit 4 of the cookie is the SACK flag (TS_OPT_SACK): emit
	 * SACK-permitted in front of the timestamp header, otherwise pad with
	 * two NOPs.
	 */
	if (tsopt[0] & bpf_htonl(1 << 4))
		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
				   (TCPOLEN_SACK_PERM << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	else
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	*buf++ = tsopt[0];
	*buf++ = tsopt[1];

	/* Low four cookie bits equal to 0xf mean the client sent no window
	 * scale option, so none is returned.
	 */
	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_WINDOW << 16) |
				   (TCPOLEN_WINDOW << 8) |
				   wscale);

	return buf - start;
}

/* Rewrite the received SYN's TCP header in place into a SYNACK.
 *
 * Sets the SYN and ACK flags (plus ECE when bit 5 of the timestamp cookie is
 * set), swaps the ports, acknowledges the client's sequence number + 1, uses
 * @cookie as the sequence number, zeroes window, urgent pointer and checksum,
 * and appends the options. doff is set to 5 plus the number of option words.
 */
static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
					   __u32 cookie, __be32 *tsopt,
					   __u16 mss, __u8 wscale)
{
	void *tcp_options;

	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
	swap(tcp_header->source, tcp_header->dest);
	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
	tcp_header->seq = bpf_htonl(cookie);
	tcp_header->window = 0;
	tcp_header->urg_ptr = 0;
	tcp_header->check = 0; /* Calculate checksum later. */

	tcp_options = (void *)(tcp_header + 1);
	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
}

/* Turn the received IPv4 SYN into a SYNACK: swap MAC and IP addresses, reset
 * tos/id/checksum, set the configured TTL, rewrite the TCP header and update
 * hdr->tcp_len and the IPv4 total length. Checksums are left at zero for the
 * caller to fill in.
 */
static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, false);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
	hdr->ipv4->check = 0; /* Calculate checksum later. */
	hdr->ipv4->tos = 0;
	hdr->ipv4->id = 0;
	hdr->ipv4->ttl = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	hdr->tcp_len = hdr->tcp->doff * 4;
	/* The SYNACK is built with a fixed-size IPv4 header (no options). */
	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
}

/* IPv6 counterpart of tcpv4_gen_synack(): swap MAC and IP addresses, reset the
 * first header word, set the hop limit, rewrite the TCP header and update
 * hdr->tcp_len and the payload length.
 */
static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, true);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
	/* First 32-bit word: version nibble 6, all remaining bits zero. */
	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
	hdr->ipv6->hop_limit = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	hdr->tcp_len = hdr->tcp->doff * 4;
	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
}

/* Answer a SYN with a SYN-cookie SYNACK built in place.
 *
 * Steps: reject SYNs carrying FIN or RST and SYNs to ports that are not in
 * allowed_ports; verify the received checksums; obtain the cookie from the
 * bpf_tcp_raw_gen_syncookie_ipv{4,6} helper; build the optional timestamp
 * cookie; rewrite the headers; recompute the checksums; resize the packet to
 * the SYNACK's length and bump the SYNACK counter.
 *
 * @ctx: struct xdp_md * when @xdp is true, struct __sk_buff * otherwise.
 *
 * Returns XDP_TX when the SYNACK is ready to be sent, XDP_DROP for rejected
 * packets and XDP_ABORTED when a helper fails or a bounds check does not hold.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
						void *ctx,
						void *data, void *data_end,
						bool xdp)
{
	__u32 old_pkt_size, new_pkt_size;
	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
	 * the pointer value and use it directly, otherwise tcp_mkoptions is
	 * (mis)compiled like this:
	 *   if (!tsopt)
	 *       return buf - start;
	 *   reg = stored_return_value_of_tscookie_init;
	 *   if (reg)
	 *       tsopt = tsopt_buf;
	 *   else
	 *       tsopt = NULL;
	 *   ...
	 *   *buf++ = tsopt[1];
	 * It creates a dead branch where tsopt is assigned NULL, but the
	 * verifier can't prove it's dead and blocks the program.
	 */
	__be32 * volatile tsopt = NULL;
	__be32 tsopt_buf[2] = {};
	__u16 ip_len;
	__u32 cookie;
	__s64 value;

	/* Checksum is not yet verified, but both checksum failure and TCP
	 * header checks return XDP_DROP, so the order doesn't matter.
	 */
	if (hdr->tcp->fin || hdr->tcp->rst)
		return XDP_DROP;

	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
	 * ports.
	 */
	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
		return XDP_DROP;

	if (hdr->ipv4) {
		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_fold(value) != 0)
			return XDP_DROP; /* Bad IPv4 checksum. */

		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		/* The SYNACK always uses an IPv4 header without options. */
		ip_len = sizeof(*hdr->ipv4);

		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
						       hdr->tcp_len);
	} else if (hdr->ipv6) {
		/* Check the TCP checksum before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		ip_len = sizeof(*hdr->ipv6);

		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
						       hdr->tcp_len);
	} else {
		return XDP_ABORTED;
	}

	/* A negative helper result is treated as failure; otherwise the low
	 * 32 bits are used as the SYNACK sequence number.
	 */
	if (value < 0)
		return XDP_ABORTED;
	cookie = (__u32)value;

	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
		tsopt = tsopt_buf;

	/* Check that there is enough space for a SYNACK. It also covers
	 * the check that the destination of the __builtin_memmove below
	 * doesn't overflow.
	 */
	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	if (hdr->ipv4) {
		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
			struct tcphdr *new_tcp_header;

			/* Drop the IPv4 options: move the fixed TCP header so
			 * that it directly follows a 20-byte IPv4 header.
			 */
			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
			hdr->tcp = new_tcp_header;

			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
		}

		tcpv4_gen_synack(hdr, cookie, tsopt);
	} else if (hdr->ipv6) {
		tcpv6_gen_synack(hdr, cookie, tsopt);
	} else {
		return XDP_ABORTED;
	}

	/* Recalculate checksums. */
	hdr->tcp->check = 0;
	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
	if (value < 0)
		return XDP_ABORTED;
	if (hdr->ipv4) {
		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
						    hdr->ipv4->daddr,
						    hdr->tcp_len,
						    IPPROTO_TCP,
						    value);

		hdr->ipv4->check = 0;
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
		if (value < 0)
			return XDP_ABORTED;
		hdr->ipv4->check = csum_fold(value);
	} else if (hdr->ipv6) {
		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
						  &hdr->ipv6->daddr,
						  hdr->tcp_len,
						  IPPROTO_TCP,
						  value);
	} else {
		return XDP_ABORTED;
	}

	/* Set the new packet size. The XDP helper is given the difference
	 * between the new and the old size, the skb helper the new absolute
	 * length.
	 */
	old_pkt_size = data_end - data;
	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	if (xdp) {
		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
			return XDP_ABORTED;
	} else {
		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
			return XDP_ABORTED;
	}

	values_inc_synacks();

	return XDP_TX;
}

/* Validate an ACK that does not belong to a confirmed connection.
 *
 * Returns XDP_DROP for segments with RST set or when the
 * bpf_tcp_raw_check_syncookie_ipv{4,6} helper returns non-zero, XDP_PASS when
 * the helper returns 0, and XDP_ABORTED when neither IP header pointer is set.
 */
static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
{
	int err;

	if (hdr->tcp->rst)
		return XDP_DROP;

	if (hdr->ipv4)
		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
	else if (hdr->ipv6)
		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
	else
		return XDP_ABORTED;
	if (err)
		return XDP_DROP;

	return XDP_PASS;
}

/* First half of the program: dissect the packet, consult conntrack, keep only
 * segments with exactly one of SYN and ACK set, and grow the packet tail by
 * TCP_MAXLEN - hdr->tcp_len bytes.
 *
 * Returns XDP_TX when the caller should reload data/data_end and continue with
 * syncookie_part2(); any other value is the final verdict.
 */
static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	int ret;

	ret = tcp_dissect(data, data_end, hdr);
	if (ret != XDP_TX)
		return ret;

	ret = tcp_lookup(ctx, hdr, xdp);
	if (ret != XDP_TX)
		return ret;

	/* Packet is TCP and doesn't belong to an established connection. */

	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
		return XDP_DROP;

	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
	 */
	if (xdp) {
		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
			return XDP_ABORTED;
	} else {
		/* Without volatile the verifier throws this error:
		 * R9 32-bit pointer arithmetic prohibited
		 */
		volatile u64 old_len = data_end - data;

		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
			return XDP_ABORTED;
	}

	return XDP_TX;
}

/* Second half of the program, run after the packet was resized: recompute the
 * header pointers from the reloaded @data/@data_end, re-check the bounds and
 * dispatch to the SYN or ACK handler.
 *
 * Only the NULL/non-NULL state of hdr->ipv4 and hdr->ipv6 is carried over from
 * syncookie_part1(); the pointer values themselves are rebuilt here.
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	if (hdr->ipv4) {
		hdr->eth = data;
		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		/* IPV4_MAXLEN is needed when calculating checksum.
		 * At least sizeof(struct iphdr) is needed here to access ihl.
		 */
		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
	} else if (hdr->ipv6) {
		hdr->eth = data;
		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
	} else {
		return XDP_ABORTED;
	}

	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	/* We run out of registers, tcp_len gets spilled to the stack, and the
	 * verifier forgets its min and max values checked above in tcp_dissect.
	 */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_ABORTED;

	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
			       syncookie_handle_ack(hdr);
}

/* XDP entry point. Runs both halves and returns the resulting XDP action; the
 * packet pointers are reloaded from @ctx between the halves because part 1
 * resizes the packet.
 */
SEC("xdp")
int syncookie_xdp(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct header_pointers hdr;
	int ret;

	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
	if (ret != XDP_TX)
		return ret;

	data_end = (void *)(long)ctx->data_end;
	data = (void *)(long)ctx->data;

	return syncookie_part2(ctx, data, data_end, &hdr, true);
}

/* TC entry point. Runs the same two halves on an skb and translates the XDP
 * style verdicts: XDP_PASS becomes TC_ACT_OK, XDP_TX redirects the packet to
 * the interface it arrived on, and everything else becomes TC_ACT_SHOT.
 */
SEC("tc")
int syncookie_tc(struct __sk_buff *skb)
{
	void *data_end = (void *)(long)skb->data_end;
	void *data = (void *)(long)skb->data;
	struct header_pointers hdr;
	int ret;

	ret = syncookie_part1(skb, data, data_end, &hdr, false);
	if (ret != XDP_TX)
		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;

	data_end = (void *)(long)skb->data_end;
	data = (void *)(long)skb->data;

	ret = syncookie_part2(skb, data, data_end, &hdr, false);
	switch (ret) {
	case XDP_PASS:
		return TC_ACT_OK;
	case XDP_TX:
		return bpf_redirect(skb->ifindex, 0);
	default:
		return TC_ACT_SHOT;
	}
}

char _license[] SEC("license") = "GPL";