// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <netinet/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"
#include "bpf_misc.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif
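
/* INLINING above becomes __noinline when SUBPROGS is defined, which a
 * sibling build of this test uses to exercise BPF-to-BPF calls; otherwise
 * every helper is forced inline into the classifier below.
 */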

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
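
/* For the IPv4 variant this works out as follows: saddr and daddr occupy
 * bytes 0-7 of the tuple, sport starts at offset 8 and dport ends at
 * offset 12, so the sport..dport span is 4 bytes, exactly
 * sizeof(flow_ports_t). Reading both ports through a flow_ports_t pointer
 * is therefore layout-compatible.
 */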

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
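
/* Typical use, as in cls_redirect() below:
 *
 *	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *
 * Execution only continues past this point if get_next_hop() returned
 * CONTINUE_PROCESSING; any other verdict (e.g. TC_ACT_SHOT) is returned
 * to the caller immediately.
 */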

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}
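
/* Worked example: frag_off is big-endian and packs three flag bits
 * (reserved, DF, MF) in front of a 13-bit fragment offset counted in
 * 8-byte units. A first fragment arrives with MF set and offset 0, which
 * the bpf_htons(IP_MF) test catches; any later fragment has offset > 0
 * and is caught by the IP_OFFSET_MASK test.
 */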

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}
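
/* Worked example: folding an accumulator of 0x2abcd computes
 * (0xabcd + 0x2) == 0xabcf in the first fold; the second fold is a no-op
 * since no carry remains, and the final checksum is ~0xabcf == 0x5430.
 */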

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	__pragma_loop_unroll_full
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur at most once, except Destination Options,
	 * which may occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	__pragma_loop_unroll_full
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
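
/* Example of the hdrlen arithmetic above: an extension header with
 * exthdr.len == 1 spans (1 + 1) * 8 == 16 octets in total. Reading
 * exthdr consumed the first two octets (next and len), so buf_skip()
 * advances over the remaining 14.
 */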

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6. */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only field of interest is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes and
	 * bpf_skb_store_bytes, since they handle split packets if needed (the
	 * data does not have to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, bpf_l3_csum_replace() only works with 2- and 4-byte
		 * arguments (the result is the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
		/* fallthrough */
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
					   encap->unigue.next_hop - 1);
}
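
/* Worked example: with hop_count == 2 and next_hop == 1, the first
 * skip_next_hops() consumes the single already-used hop, buf_copy() reads
 * the current hop into next_hop, and the final skip_next_hops() skips
 * 2 - 1 - 1 == 0 remaining entries, leaving pkt just past the GLB header.
 * Since skip_next_hops() only handles n == 0 or 1, this also bounds the
 * hop lists the program accepts.
 */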

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}
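
/* A note on the SYN cookie path above: bpf_tcp_check_syncookie() returns 0
 * only if iph and tcp carry a valid SYN cookie ACK for the listening
 * socket, so a SYN_COOKIE verdict means the packet completes a
 * cookie-based handshake and is accepted locally by the caller.
 */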

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}
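
	/* The hlen check above follows the GUE draft, which counts the
	 * optional fields in 32-bit words: the fixed unigue fields account
	 * for sizeof(encap->unigue) / 4 words, and each hop in the hop list
	 * is a 4-byte IPv4 address, i.e. one more word per hop_count.
	 */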

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}