1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause 2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ 3 4 #include "vmlinux.h" 5 6 #include <bpf/bpf_helpers.h> 7 #include <bpf/bpf_endian.h> 8 #include <asm/errno.h> 9 10 #define TC_ACT_OK 0 11 #define TC_ACT_SHOT 2 12 13 #define NSEC_PER_SEC 1000000000L 14 15 #define ETH_ALEN 6 16 #define ETH_P_IP 0x0800 17 #define ETH_P_IPV6 0x86DD 18 19 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) 20 21 #define IP_DF 0x4000 22 #define IP_MF 0x2000 23 #define IP_OFFSET 0x1fff 24 25 #define NEXTHDR_TCP 6 26 27 #define TCPOPT_NOP 1 28 #define TCPOPT_EOL 0 29 #define TCPOPT_MSS 2 30 #define TCPOPT_WINDOW 3 31 #define TCPOPT_SACK_PERM 4 32 #define TCPOPT_TIMESTAMP 8 33 34 #define TCPOLEN_MSS 4 35 #define TCPOLEN_WINDOW 3 36 #define TCPOLEN_SACK_PERM 2 37 #define TCPOLEN_TIMESTAMP 10 38 39 #define TCP_TS_HZ 1000 40 #define TS_OPT_WSCALE_MASK 0xf 41 #define TS_OPT_SACK (1 << 4) 42 #define TS_OPT_ECN (1 << 5) 43 #define TSBITS 6 44 #define TSMASK (((__u32)1 << TSBITS) - 1) 45 #define TCP_MAX_WSCALE 14U 46 47 #define IPV4_MAXLEN 60 48 #define TCP_MAXLEN 60 49 50 #define DEFAULT_MSS4 1460 51 #define DEFAULT_MSS6 1440 52 #define DEFAULT_WSCALE 7 53 #define DEFAULT_TTL 64 54 #define MAX_ALLOWED_PORTS 8 55 56 #define MAX_PACKET_OFF 0xffff 57 58 #define swap(a, b) \ 59 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 60 61 #define __get_unaligned_t(type, ptr) ({ \ 62 const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ 63 __pptr->x; \ 64 }) 65 66 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) 67 68 struct { 69 __uint(type, BPF_MAP_TYPE_ARRAY); 70 __type(key, __u32); 71 __type(value, __u64); 72 __uint(max_entries, 2); 73 } values SEC(".maps"); 74 75 struct { 76 __uint(type, BPF_MAP_TYPE_ARRAY); 77 __type(key, __u32); 78 __type(value, __u16); 79 __uint(max_entries, MAX_ALLOWED_PORTS); 80 } allowed_ports SEC(".maps"); 81 82 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in 83 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. 84 */ 85 86 struct bpf_ct_opts___local { 87 s32 netns_id; 88 s32 error; 89 u8 l4proto; 90 u8 dir; 91 u8 reserved[2]; 92 } __attribute__((preserve_access_index)); 93 94 #define BPF_F_CURRENT_NETNS (-1) 95 96 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, 97 struct bpf_sock_tuple *bpf_tuple, 98 __u32 len_tuple, 99 struct bpf_ct_opts___local *opts, 100 __u32 len_opts) __ksym; 101 102 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, 103 struct bpf_sock_tuple *bpf_tuple, 104 u32 len_tuple, 105 struct bpf_ct_opts___local *opts, 106 u32 len_opts) __ksym; 107 108 extern void bpf_ct_release(struct nf_conn *ct) __ksym; 109 110 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) 111 { 112 __u8 tmp[ETH_ALEN]; 113 114 __builtin_memcpy(tmp, a, ETH_ALEN); 115 __builtin_memcpy(a, b, ETH_ALEN); 116 __builtin_memcpy(b, tmp, ETH_ALEN); 117 } 118 119 static __always_inline __u16 csum_fold(__u32 csum) 120 { 121 csum = (csum & 0xffff) + (csum >> 16); 122 csum = (csum & 0xffff) + (csum >> 16); 123 return (__u16)~csum; 124 } 125 126 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, 127 __u32 len, __u8 proto, 128 __u32 csum) 129 { 130 __u64 s = csum; 131 132 s += (__u32)saddr; 133 s += (__u32)daddr; 134 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 135 s += proto + len; 136 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 137 s += (proto + len) << 8; 138 #else 139 #error Unknown endian 140 #endif 141 s = (s & 0xffffffff) + (s >> 32); 142 s = (s & 0xffffffff) + (s >> 32); 143 144 return csum_fold((__u32)s); 145 } 146 147 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, 148 const struct in6_addr *daddr, 149 __u32 len, __u8 proto, __u32 csum) 150 { 151 __u64 sum = csum; 152 int i; 153 154 #pragma unroll 155 for (i = 0; i < 4; i++) 156 sum += (__u32)saddr->in6_u.u6_addr32[i]; 157 158 #pragma unroll 159 for (i = 0; i < 4; i++) 160 sum += (__u32)daddr->in6_u.u6_addr32[i]; 161 162 /* Don't combine additions to avoid 32-bit overflow. */ 163 sum += bpf_htonl(len); 164 sum += bpf_htonl(proto); 165 166 sum = (sum & 0xffffffff) + (sum >> 32); 167 sum = (sum & 0xffffffff) + (sum >> 32); 168 169 return csum_fold((__u32)sum); 170 } 171 172 static __always_inline __u64 tcp_clock_ns(void) 173 { 174 return bpf_ktime_get_ns(); 175 } 176 177 static __always_inline __u32 tcp_ns_to_ts(__u64 ns) 178 { 179 return ns / (NSEC_PER_SEC / TCP_TS_HZ); 180 } 181 182 static __always_inline __u32 tcp_clock_ms(void) 183 { 184 return tcp_ns_to_ts(tcp_clock_ns()); 185 } 186 187 struct tcpopt_context { 188 void *data; 189 void *data_end; 190 __be32 *tsecr; 191 __u8 wscale; 192 bool option_timestamp; 193 bool option_sack; 194 __u32 off; 195 }; 196 197 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) 198 { 199 __u64 off = ctx->off; 200 __u8 *data; 201 202 /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ 203 if (off > MAX_PACKET_OFF - sz) 204 return NULL; 205 206 data = ctx->data + off; 207 barrier_var(data); 208 if (data + sz >= ctx->data_end) 209 return NULL; 210 211 ctx->off += sz; 212 return data; 213 } 214 215 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) 216 { 217 __u8 *opcode, *opsize, *wscale, *tsecr; 218 __u32 off = ctx->off; 219 220 opcode = next(ctx, 1); 221 if (!opcode) 222 return 1; 223 224 if (*opcode == TCPOPT_EOL) 225 return 1; 226 if (*opcode == TCPOPT_NOP) 227 return 0; 228 229 opsize = next(ctx, 1); 230 if (!opsize || *opsize < 2) 231 return 1; 232 233 switch (*opcode) { 234 case TCPOPT_WINDOW: 235 wscale = next(ctx, 1); 236 if (!wscale) 237 return 1; 238 if (*opsize == TCPOLEN_WINDOW) 239 ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; 240 break; 241 case TCPOPT_TIMESTAMP: 242 tsecr = next(ctx, 4); 243 if (!tsecr) 244 return 1; 245 if (*opsize == TCPOLEN_TIMESTAMP) { 246 ctx->option_timestamp = true; 247 /* Client's tsval becomes our tsecr. */ 248 *ctx->tsecr = get_unaligned((__be32 *)tsecr); 249 } 250 break; 251 case TCPOPT_SACK_PERM: 252 if (*opsize == TCPOLEN_SACK_PERM) 253 ctx->option_sack = true; 254 break; 255 } 256 257 ctx->off = off + *opsize; 258 259 return 0; 260 } 261 262 static int tscookie_tcpopt_parse_batch(__u32 index, void *context) 263 { 264 int i; 265 266 for (i = 0; i < 7; i++) 267 if (tscookie_tcpopt_parse(context)) 268 return 1; 269 return 0; 270 } 271 272 static __always_inline bool tscookie_init(struct tcphdr *tcp_header, 273 __u16 tcp_len, __be32 *tsval, 274 __be32 *tsecr, void *data, void *data_end) 275 { 276 struct tcpopt_context loop_ctx = { 277 .data = data, 278 .data_end = data_end, 279 .tsecr = tsecr, 280 .wscale = TS_OPT_WSCALE_MASK, 281 .option_timestamp = false, 282 .option_sack = false, 283 /* Note: currently verifier would track .off as unbound scalar. 284 * In case if verifier would at some point get smarter and 285 * compute bounded value for this var, beware that it might 286 * hinder bpf_loop() convergence validation. 287 */ 288 .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, 289 }; 290 u32 cookie; 291 292 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); 293 294 if (!loop_ctx.option_timestamp) 295 return false; 296 297 cookie = tcp_clock_ms() & ~TSMASK; 298 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; 299 if (loop_ctx.option_sack) 300 cookie |= TS_OPT_SACK; 301 if (tcp_header->ece && tcp_header->cwr) 302 cookie |= TS_OPT_ECN; 303 *tsval = bpf_htonl(cookie); 304 305 return true; 306 } 307 308 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, 309 __u8 *ttl, bool ipv6) 310 { 311 __u32 key = 0; 312 __u64 *value; 313 314 value = bpf_map_lookup_elem(&values, &key); 315 if (value && *value != 0) { 316 if (ipv6) 317 *mss = (*value >> 32) & 0xffff; 318 else 319 *mss = *value & 0xffff; 320 *wscale = (*value >> 16) & 0xf; 321 *ttl = (*value >> 24) & 0xff; 322 return; 323 } 324 325 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; 326 *wscale = DEFAULT_WSCALE; 327 *ttl = DEFAULT_TTL; 328 } 329 330 static __always_inline void values_inc_synacks(void) 331 { 332 __u32 key = 1; 333 __u64 *value; 334 335 value = bpf_map_lookup_elem(&values, &key); 336 if (value) 337 __sync_fetch_and_add(value, 1); 338 } 339 340 static __always_inline bool check_port_allowed(__u16 port) 341 { 342 __u32 i; 343 344 for (i = 0; i < MAX_ALLOWED_PORTS; i++) { 345 __u32 key = i; 346 __u16 *value; 347 348 value = bpf_map_lookup_elem(&allowed_ports, &key); 349 350 if (!value) 351 break; 352 /* 0 is a terminator value. Check it first to avoid matching on 353 * a forbidden port == 0 and returning true. 354 */ 355 if (*value == 0) 356 break; 357 358 if (*value == port) 359 return true; 360 } 361 362 return false; 363 } 364 365 struct header_pointers { 366 struct ethhdr *eth; 367 struct iphdr *ipv4; 368 struct ipv6hdr *ipv6; 369 struct tcphdr *tcp; 370 __u16 tcp_len; 371 }; 372 373 static __always_inline int tcp_dissect(void *data, void *data_end, 374 struct header_pointers *hdr) 375 { 376 hdr->eth = data; 377 if (hdr->eth + 1 > data_end) 378 return XDP_DROP; 379 380 switch (bpf_ntohs(hdr->eth->h_proto)) { 381 case ETH_P_IP: 382 hdr->ipv6 = NULL; 383 384 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 385 if (hdr->ipv4 + 1 > data_end) 386 return XDP_DROP; 387 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) 388 return XDP_DROP; 389 if (hdr->ipv4->version != 4) 390 return XDP_DROP; 391 392 if (hdr->ipv4->protocol != IPPROTO_TCP) 393 return XDP_PASS; 394 395 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 396 break; 397 case ETH_P_IPV6: 398 hdr->ipv4 = NULL; 399 400 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 401 if (hdr->ipv6 + 1 > data_end) 402 return XDP_DROP; 403 if (hdr->ipv6->version != 6) 404 return XDP_DROP; 405 406 /* XXX: Extension headers are not supported and could circumvent 407 * XDP SYN flood protection. 408 */ 409 if (hdr->ipv6->nexthdr != NEXTHDR_TCP) 410 return XDP_PASS; 411 412 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 413 break; 414 default: 415 /* XXX: VLANs will circumvent XDP SYN flood protection. */ 416 return XDP_PASS; 417 } 418 419 if (hdr->tcp + 1 > data_end) 420 return XDP_DROP; 421 hdr->tcp_len = hdr->tcp->doff * 4; 422 if (hdr->tcp_len < sizeof(*hdr->tcp)) 423 return XDP_DROP; 424 425 return XDP_TX; 426 } 427 428 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) 429 { 430 struct bpf_ct_opts___local ct_lookup_opts = { 431 .netns_id = BPF_F_CURRENT_NETNS, 432 .l4proto = IPPROTO_TCP, 433 }; 434 struct bpf_sock_tuple tup = {}; 435 struct nf_conn *ct; 436 __u32 tup_size; 437 438 if (hdr->ipv4) { 439 /* TCP doesn't normally use fragments, and XDP can't reassemble 440 * them. 441 */ 442 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) 443 return XDP_DROP; 444 445 tup.ipv4.saddr = hdr->ipv4->saddr; 446 tup.ipv4.daddr = hdr->ipv4->daddr; 447 tup.ipv4.sport = hdr->tcp->source; 448 tup.ipv4.dport = hdr->tcp->dest; 449 tup_size = sizeof(tup.ipv4); 450 } else if (hdr->ipv6) { 451 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); 452 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); 453 tup.ipv6.sport = hdr->tcp->source; 454 tup.ipv6.dport = hdr->tcp->dest; 455 tup_size = sizeof(tup.ipv6); 456 } else { 457 /* The verifier can't track that either ipv4 or ipv6 is not 458 * NULL. 459 */ 460 return XDP_ABORTED; 461 } 462 if (xdp) 463 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 464 else 465 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 466 if (ct) { 467 unsigned long status = ct->status; 468 469 bpf_ct_release(ct); 470 if (status & IPS_CONFIRMED) 471 return XDP_PASS; 472 } else if (ct_lookup_opts.error != -ENOENT) { 473 return XDP_ABORTED; 474 } 475 476 /* error == -ENOENT || !(status & IPS_CONFIRMED) */ 477 return XDP_TX; 478 } 479 480 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, 481 __u8 wscale) 482 { 483 __be32 *start = buf; 484 485 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); 486 487 if (!tsopt) 488 return buf - start; 489 490 if (tsopt[0] & bpf_htonl(1 << 4)) 491 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | 492 (TCPOLEN_SACK_PERM << 16) | 493 (TCPOPT_TIMESTAMP << 8) | 494 TCPOLEN_TIMESTAMP); 495 else 496 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 497 (TCPOPT_NOP << 16) | 498 (TCPOPT_TIMESTAMP << 8) | 499 TCPOLEN_TIMESTAMP); 500 *buf++ = tsopt[0]; 501 *buf++ = tsopt[1]; 502 503 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) 504 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 505 (TCPOPT_WINDOW << 16) | 506 (TCPOLEN_WINDOW << 8) | 507 wscale); 508 509 return buf - start; 510 } 511 512 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, 513 __u32 cookie, __be32 *tsopt, 514 __u16 mss, __u8 wscale) 515 { 516 void *tcp_options; 517 518 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; 519 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) 520 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; 521 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ 522 swap(tcp_header->source, tcp_header->dest); 523 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); 524 tcp_header->seq = bpf_htonl(cookie); 525 tcp_header->window = 0; 526 tcp_header->urg_ptr = 0; 527 tcp_header->check = 0; /* Calculate checksum later. */ 528 529 tcp_options = (void *)(tcp_header + 1); 530 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); 531 } 532 533 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, 534 __u32 cookie, __be32 *tsopt) 535 { 536 __u8 wscale; 537 __u16 mss; 538 __u8 ttl; 539 540 values_get_tcpipopts(&mss, &wscale, &ttl, false); 541 542 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 543 544 swap(hdr->ipv4->saddr, hdr->ipv4->daddr); 545 hdr->ipv4->check = 0; /* Calculate checksum later. */ 546 hdr->ipv4->tos = 0; 547 hdr->ipv4->id = 0; 548 hdr->ipv4->ttl = ttl; 549 550 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 551 552 hdr->tcp_len = hdr->tcp->doff * 4; 553 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); 554 } 555 556 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, 557 __u32 cookie, __be32 *tsopt) 558 { 559 __u8 wscale; 560 __u16 mss; 561 __u8 ttl; 562 563 values_get_tcpipopts(&mss, &wscale, &ttl, true); 564 565 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 566 567 swap(hdr->ipv6->saddr, hdr->ipv6->daddr); 568 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); 569 hdr->ipv6->hop_limit = ttl; 570 571 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 572 573 hdr->tcp_len = hdr->tcp->doff * 4; 574 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); 575 } 576 577 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, 578 void *ctx, 579 void *data, void *data_end, 580 bool xdp) 581 { 582 __u32 old_pkt_size, new_pkt_size; 583 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the 584 * BPF verifier if tsopt is not volatile. Volatile forces it to store 585 * the pointer value and use it directly, otherwise tcp_mkoptions is 586 * (mis)compiled like this: 587 * if (!tsopt) 588 * return buf - start; 589 * reg = stored_return_value_of_tscookie_init; 590 * if (reg) 591 * tsopt = tsopt_buf; 592 * else 593 * tsopt = NULL; 594 * ... 595 * *buf++ = tsopt[1]; 596 * It creates a dead branch where tsopt is assigned NULL, but the 597 * verifier can't prove it's dead and blocks the program. 598 */ 599 __be32 * volatile tsopt = NULL; 600 __be32 tsopt_buf[2] = {}; 601 __u16 ip_len; 602 __u32 cookie; 603 __s64 value; 604 605 /* Checksum is not yet verified, but both checksum failure and TCP 606 * header checks return XDP_DROP, so the order doesn't matter. 607 */ 608 if (hdr->tcp->fin || hdr->tcp->rst) 609 return XDP_DROP; 610 611 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked 612 * ports. 613 */ 614 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) 615 return XDP_DROP; 616 617 if (hdr->ipv4) { 618 /* Check the IPv4 and TCP checksums before creating a SYNACK. */ 619 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); 620 if (value < 0) 621 return XDP_ABORTED; 622 if (csum_fold(value) != 0) 623 return XDP_DROP; /* Bad IPv4 checksum. */ 624 625 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 626 if (value < 0) 627 return XDP_ABORTED; 628 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, 629 hdr->tcp_len, IPPROTO_TCP, value) != 0) 630 return XDP_DROP; /* Bad TCP checksum. */ 631 632 ip_len = sizeof(*hdr->ipv4); 633 634 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, 635 hdr->tcp_len); 636 } else if (hdr->ipv6) { 637 /* Check the TCP checksum before creating a SYNACK. */ 638 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 639 if (value < 0) 640 return XDP_ABORTED; 641 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, 642 hdr->tcp_len, IPPROTO_TCP, value) != 0) 643 return XDP_DROP; /* Bad TCP checksum. */ 644 645 ip_len = sizeof(*hdr->ipv6); 646 647 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, 648 hdr->tcp_len); 649 } else { 650 return XDP_ABORTED; 651 } 652 653 if (value < 0) 654 return XDP_ABORTED; 655 cookie = (__u32)value; 656 657 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, 658 &tsopt_buf[0], &tsopt_buf[1], data, data_end)) 659 tsopt = tsopt_buf; 660 661 /* Check that there is enough space for a SYNACK. It also covers 662 * the check that the destination of the __builtin_memmove below 663 * doesn't overflow. 664 */ 665 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) 666 return XDP_ABORTED; 667 668 if (hdr->ipv4) { 669 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { 670 struct tcphdr *new_tcp_header; 671 672 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); 673 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); 674 hdr->tcp = new_tcp_header; 675 676 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; 677 } 678 679 tcpv4_gen_synack(hdr, cookie, tsopt); 680 } else if (hdr->ipv6) { 681 tcpv6_gen_synack(hdr, cookie, tsopt); 682 } else { 683 return XDP_ABORTED; 684 } 685 686 /* Recalculate checksums. */ 687 hdr->tcp->check = 0; 688 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 689 if (value < 0) 690 return XDP_ABORTED; 691 if (hdr->ipv4) { 692 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, 693 hdr->ipv4->daddr, 694 hdr->tcp_len, 695 IPPROTO_TCP, 696 value); 697 698 hdr->ipv4->check = 0; 699 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); 700 if (value < 0) 701 return XDP_ABORTED; 702 hdr->ipv4->check = csum_fold(value); 703 } else if (hdr->ipv6) { 704 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, 705 &hdr->ipv6->daddr, 706 hdr->tcp_len, 707 IPPROTO_TCP, 708 value); 709 } else { 710 return XDP_ABORTED; 711 } 712 713 /* Set the new packet size. */ 714 old_pkt_size = data_end - data; 715 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; 716 if (xdp) { 717 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) 718 return XDP_ABORTED; 719 } else { 720 if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) 721 return XDP_ABORTED; 722 } 723 724 values_inc_synacks(); 725 726 return XDP_TX; 727 } 728 729 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) 730 { 731 int err; 732 733 if (hdr->tcp->rst) 734 return XDP_DROP; 735 736 if (hdr->ipv4) 737 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); 738 else if (hdr->ipv6) 739 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); 740 else 741 return XDP_ABORTED; 742 if (err) 743 return XDP_DROP; 744 745 return XDP_PASS; 746 } 747 748 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, 749 struct header_pointers *hdr, bool xdp) 750 { 751 int ret; 752 753 ret = tcp_dissect(data, data_end, hdr); 754 if (ret != XDP_TX) 755 return ret; 756 757 ret = tcp_lookup(ctx, hdr, xdp); 758 if (ret != XDP_TX) 759 return ret; 760 761 /* Packet is TCP and doesn't belong to an established connection. */ 762 763 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) 764 return XDP_DROP; 765 766 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len 767 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. 768 */ 769 if (xdp) { 770 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) 771 return XDP_ABORTED; 772 } else { 773 /* Without volatile the verifier throws this error: 774 * R9 32-bit pointer arithmetic prohibited 775 */ 776 volatile u64 old_len = data_end - data; 777 778 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) 779 return XDP_ABORTED; 780 } 781 782 return XDP_TX; 783 } 784 785 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, 786 struct header_pointers *hdr, bool xdp) 787 { 788 if (hdr->ipv4) { 789 hdr->eth = data; 790 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 791 /* IPV4_MAXLEN is needed when calculating checksum. 792 * At least sizeof(struct iphdr) is needed here to access ihl. 793 */ 794 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) 795 return XDP_ABORTED; 796 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 797 } else if (hdr->ipv6) { 798 hdr->eth = data; 799 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 800 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 801 } else { 802 return XDP_ABORTED; 803 } 804 805 if ((void *)hdr->tcp + TCP_MAXLEN > data_end) 806 return XDP_ABORTED; 807 808 /* We run out of registers, tcp_len gets spilled to the stack, and the 809 * verifier forgets its min and max values checked above in tcp_dissect. 810 */ 811 hdr->tcp_len = hdr->tcp->doff * 4; 812 if (hdr->tcp_len < sizeof(*hdr->tcp)) 813 return XDP_ABORTED; 814 815 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : 816 syncookie_handle_ack(hdr); 817 } 818 819 SEC("xdp") 820 int syncookie_xdp(struct xdp_md *ctx) 821 { 822 void *data_end = (void *)(long)ctx->data_end; 823 void *data = (void *)(long)ctx->data; 824 struct header_pointers hdr; 825 int ret; 826 827 ret = syncookie_part1(ctx, data, data_end, &hdr, true); 828 if (ret != XDP_TX) 829 return ret; 830 831 data_end = (void *)(long)ctx->data_end; 832 data = (void *)(long)ctx->data; 833 834 return syncookie_part2(ctx, data, data_end, &hdr, true); 835 } 836 837 SEC("tc") 838 int syncookie_tc(struct __sk_buff *skb) 839 { 840 void *data_end = (void *)(long)skb->data_end; 841 void *data = (void *)(long)skb->data; 842 struct header_pointers hdr; 843 int ret; 844 845 ret = syncookie_part1(skb, data, data_end, &hdr, false); 846 if (ret != XDP_TX) 847 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; 848 849 data_end = (void *)(long)skb->data_end; 850 data = (void *)(long)skb->data; 851 852 ret = syncookie_part2(skb, data, data_end, &hdr, false); 853 switch (ret) { 854 case XDP_PASS: 855 return TC_ACT_OK; 856 case XDP_TX: 857 return bpf_redirect(skb->ifindex, 0); 858 default: 859 return TC_ACT_SHOT; 860 } 861 } 862 863 char _license[] SEC("license") = "GPL"; 864