// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <netinet/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"
#include "bpf_misc.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t. */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
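/* For illustration, a typical use further down in cls_redirect():
 *
 *	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *
 * expands to an early return of whatever TC_ACT_* value get_next_hop()
 * produced, and falls through only when it returned CONTINUE_PROCESSING.
 */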
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}
/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	__pragma_loop_unroll_full
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	__pragma_loop_unroll_full
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
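/* Worked example for the extension header arithmetic above: a Hop-by-Hop
 * Options header with len == 0 occupies (0 + 1) * 8 = 8 octets in total.
 * buf_copy() has already consumed the two bytes of next/len, so buf_skip()
 * advances by 8 - sizeof(exthdr) = 6 octets to land on the following header.
 */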
/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}
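/* Note on usage: metrics_map is a BPF_MAP_TYPE_PERCPU_ARRAY, so the pointer
 * returned by get_global_metrics() refers to the copy owned by the CPU
 * currently running the program, and the counter increments throughout this
 * program need no synchronization. A userspace reader looking up key 0 gets
 * one metrics_t per possible CPU and has to sum the per-CPU values itself.
 */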
static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	// Changing the ethertype if the encapsulated packet is ipv6
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this helper only works for 2 and 4 byte arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* Swap L2 addresses.
	 * This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
				      encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}
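/* For illustration: process_ipv4() further down calls
 *
 *    process_tcp(pkt, ipv4, sizeof(*ipv4), metrics)
 *
 * and process_tcp() passes that length on to fill_tuple(). With everything
 * inlined, the verifier can track iphlen as the constant 20 and therefore
 * knows that the tuplen handed to bpf_skc_lookup_tcp() is sizeof(tuple->ipv4).
 */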
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}
static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}
static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}
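/* Main TC classifier. The packet layout expected here, as implied by the
 * checks below and by encap_headers_t (from test_cls_redirect.h), is roughly:
 *
 *    Ethernet | outer IPv4 (daddr == ENCAPSULATION_IP) |
 *    UDP (dest == ENCAPSULATION_PORT) | GUE | unigue header followed by
 *    hop_count next-hop IPv4 addresses | inner IPv4 or IPv6 packet
 *
 * Packets that do not look like this are passed up the stack (TC_ACT_OK);
 * packets that are clearly ours but malformed are dropped (TC_ACT_SHOT).
 */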
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}