xref: /linux/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c (revision c435bce6af9b2a277662698875a689c389358f17)
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #include "vmlinux.h"
5 
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_endian.h>
8 #include <asm/errno.h>
9 
/* Verdicts returned by the tc entry point, syncookie_tc(). */
#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

/* Nanoseconds per second; tcp_ns_to_ts() derives its divisor from it. */
#define NSEC_PER_SEC 1000000000L

/* Ethernet address length in bytes (see swap_eth_addr()) and the EtherType
 * values that tcp_dissect() matches against eth->h_proto.
 */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* View a TCP header as union tcp_word_hdr and select its fourth 32-bit word.
 * tcp_gen_synack() writes the TCP flags through it; that word also holds
 * doff (see the comment there).
 */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Masks applied to the IPv4 frag_off field in tcp_lookup(). */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

/* IPv6 nexthdr value that tcp_dissect() accepts as TCP. */
#define NEXTHDR_TCP 6

/* TCP option kinds recognised by tscookie_tcpopt_parse() and emitted by
 * tcp_mkoptions().
 */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* Option lengths, in bytes, that go with the kinds above. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Timestamp cookie layout used by tscookie_init(): the clock runs at
 * TCP_TS_HZ ticks per second and its low TSBITS bits are replaced by the
 * window scale (bits 0-3), a SACK flag (bit 4) and an ECN flag (bit 5).
 * TCP_MAX_WSCALE is the clamp applied to a parsed window-scale value.
 */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE 14U

/* Byte counts used for the IPv4 and TCP header bounds checks; the packet is
 * grown by TCP_MAXLEN - tcp_len in syncookie_part1().
 */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used by values_get_tcpipopts() when slot 0 of the values map is
 * missing or zero; MAX_ALLOWED_PORTS sizes the allowed_ports map and bounds
 * the loop in check_port_allowed().
 */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8

/* Largest packet offset next() will hand out (see the verifier note there). */
#define MAX_PACKET_OFF 0xffff
57 
/* Exchange two lvalues of the same type through a temporary. Each argument
 * is expanded more than once, so it must not have side effects.
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* Load a value of @type from @ptr through a packed one-member struct, so the
 * access does not rely on @ptr being aligned for @type.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
	__pptr->x;								\
})

/* Unaligned load of *ptr; the value type is taken from the pointer type. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
67 
/* Two-slot array map shared with whoever configures the program.
 *
 * Slot 0 is read by values_get_tcpipopts() as a packed word: IPv4 MSS in
 * bits 0-15, window scale in bits 16-19, TTL in bits 24-31 and IPv6 MSS in
 * bits 32-47. A zero word selects the built-in defaults.
 * Slot 1 is a counter that values_inc_synacks() increments atomically.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 2);
} values SEC(".maps");
74 
/* List of TCP destination ports for which SYN cookies are issued.
 *
 * check_port_allowed() compares entries against the host-byte-order port and
 * stops at the first entry equal to 0, which acts as a terminator.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u16);
	__uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
81 
/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
 */

/* Options block passed to the conntrack lookup kfuncs. In this file
 * tcp_lookup() fills in netns_id and l4proto and reads back error when the
 * lookup returns NULL; dir and reserved are left zero-initialised.
 */
struct bpf_ct_opts___local {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u8 reserved[2];
} __attribute__((preserve_access_index));

/* Value stored in netns_id by tcp_lookup(). */
#define BPF_F_CURRENT_NETNS (-1)
95 
/* Conntrack lookup for XDP contexts, resolved through __ksym. tcp_lookup()
 * treats a NULL result as a failure described by opts->error and hands every
 * non-NULL result to bpf_ct_release().
 */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 __u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 __u32 len_opts) __ksym;

/* Same lookup for tc (struct __sk_buff) contexts. */
extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 u32 len_opts) __ksym;

/* Releases a conntrack entry returned by one of the lookups above. */
extern void bpf_ct_release(struct nf_conn *ct) __ksym;
109 
110 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
111 {
112 	__u8 tmp[ETH_ALEN];
113 
114 	__builtin_memcpy(tmp, a, ETH_ALEN);
115 	__builtin_memcpy(a, b, ETH_ALEN);
116 	__builtin_memcpy(b, tmp, ETH_ALEN);
117 }
118 
/* Fold a 32-bit one's-complement sum into 16 bits and return its complement.
 * The fold is applied twice so the carry out of the first addition is
 * absorbed as well.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 folded;

	folded = (csum >> 16) + (csum & 0xffff);
	folded = (folded >> 16) + (folded & 0xffff);

	return (__u16)~folded;
}
125 
126 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
127 					       __u32 len, __u8 proto,
128 					       __u32 csum)
129 {
130 	__u64 s = csum;
131 
132 	s += (__u32)saddr;
133 	s += (__u32)daddr;
134 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
135 	s += proto + len;
136 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
137 	s += (proto + len) << 8;
138 #else
139 #error Unknown endian
140 #endif
141 	s = (s & 0xffffffff) + (s >> 32);
142 	s = (s & 0xffffffff) + (s >> 32);
143 
144 	return csum_fold((__u32)s);
145 }
146 
147 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
148 					     const struct in6_addr *daddr,
149 					     __u32 len, __u8 proto, __u32 csum)
150 {
151 	__u64 sum = csum;
152 	int i;
153 
154 #pragma unroll
155 	for (i = 0; i < 4; i++)
156 		sum += (__u32)saddr->in6_u.u6_addr32[i];
157 
158 #pragma unroll
159 	for (i = 0; i < 4; i++)
160 		sum += (__u32)daddr->in6_u.u6_addr32[i];
161 
162 	/* Don't combine additions to avoid 32-bit overflow. */
163 	sum += bpf_htonl(len);
164 	sum += bpf_htonl(proto);
165 
166 	sum = (sum & 0xffffffff) + (sum >> 32);
167 	sum = (sum & 0xffffffff) + (sum >> 32);
168 
169 	return csum_fold((__u32)sum);
170 }
171 
172 static __always_inline __u64 tcp_clock_ns(void)
173 {
174 	return bpf_ktime_get_ns();
175 }
176 
177 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
178 {
179 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
180 }
181 
182 static __always_inline __u32 tcp_clock_ms(void)
183 {
184 	return tcp_ns_to_ts(tcp_clock_ns());
185 }
186 
/* State carried through the bpf_loop() based TCP option parser. It is set up
 * by tscookie_init() and updated by next() and tscookie_tcpopt_parse().
 */
struct tcpopt_context {
	void *data;		/* start of the packet */
	void *data_end;		/* end of the packet, used for bounds checks */
	__be32 *tsecr;		/* where the client's timestamp value is stored */
	__u8 wscale;		/* parsed window scale; starts as TS_OPT_WSCALE_MASK */
	bool option_timestamp;	/* a timestamp option of the expected length was seen */
	bool option_sack;	/* a SACK-permitted option of the expected length was seen */
	__u32 off;		/* offset from data of the next byte to parse */
};
196 
/* Hand out the next @sz bytes of the packet, starting at ctx->off.
 *
 * Returns a pointer to those bytes and advances ctx->off by @sz. Returns NULL
 * and leaves ctx->off unchanged when the offset would exceed MAX_PACKET_OFF
 * or when the end of the requested range is at or beyond ctx->data_end.
 */
static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
{
	__u64 off = ctx->off;
	__u8 *data;

	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
	if (off > MAX_PACKET_OFF - sz)
		return NULL;

	data = ctx->data + off;
	/* NOTE(review): barrier_var() presumably keeps the compiler from
	 * reshaping the pointer computation ahead of the bounds check below -
	 * confirm before touching.
	 */
	barrier_var(data);
	/* Note the >=: a range ending exactly at data_end is rejected too. */
	if (data + sz >= ctx->data_end)
		return NULL;

	ctx->off += sz;
	return data;
}
214 
/* Parse a single TCP option at ctx->off and record what the timestamp cookie
 * needs: window scale, presence of the timestamp option (plus the client's
 * timestamp value) and presence of SACK-permitted.
 *
 * Returns 0 when parsing may continue with the following option and 1 when
 * it must stop: end-of-list option, an option length below 2, or a read that
 * next() refused.
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
	__u8 *opcode, *opsize, *wscale, *tsecr;
	/* Offset of this option's kind byte, used to skip the whole option. */
	__u32 off = ctx->off;

	opcode = next(ctx, 1);
	if (!opcode)
		return 1;

	if (*opcode == TCPOPT_EOL)
		return 1;
	/* NOP is a single byte; next() has already stepped past it. */
	if (*opcode == TCPOPT_NOP)
		return 0;

	opsize = next(ctx, 1);
	if (!opsize || *opsize < 2)
		return 1;

	switch (*opcode) {
	case TCPOPT_WINDOW:
		wscale = next(ctx, 1);
		if (!wscale)
			return 1;
		/* Only a well-formed option updates the scale, clamped to
		 * TCP_MAX_WSCALE.
		 */
		if (*opsize == TCPOLEN_WINDOW)
			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
		break;
	case TCPOPT_TIMESTAMP:
		/* The four bytes following the length byte. */
		tsecr = next(ctx, 4);
		if (!tsecr)
			return 1;
		if (*opsize == TCPOLEN_TIMESTAMP) {
			ctx->option_timestamp = true;
			/* Client's tsval becomes our tsecr. */
			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
		}
		break;
	case TCPOPT_SACK_PERM:
		if (*opsize == TCPOLEN_SACK_PERM)
			ctx->option_sack = true;
		break;
	}

	/* Jump to the next option using the declared length, regardless of how
	 * many bytes were consumed above.
	 */
	ctx->off = off + *opsize;

	return 0;
}
261 
262 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
263 {
264 	int i;
265 
266 	for (i = 0; i < 7; i++)
267 		if (tscookie_tcpopt_parse(context))
268 			return 1;
269 	return 0;
270 }
271 
/* Derive the timestamp option values for a SYNACK from the options of the
 * received SYN.
 *
 * The options following @tcp_header are parsed through bpf_loop() (6 rounds
 * of tscookie_tcpopt_parse_batch()). If no timestamp option of the expected
 * length was found, returns false and leaves *tsval untouched. Otherwise
 * *tsval receives, in network byte order, the current tick count with its low
 * TSBITS bits replaced by the window scale, the SACK flag and the ECN flag
 * (set when the SYN has both ece and cwr), and the function returns true.
 * *tsecr is written by the parser with the client's timestamp value.
 *
 * NOTE(review): @tcp_len is not referenced in the body.
 */
static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
					  __u16 tcp_len, __be32 *tsval,
					  __be32 *tsecr, void *data, void *data_end)
{
	struct tcpopt_context loop_ctx = {
		.data = data,
		.data_end = data_end,
		.tsecr = tsecr,
		/* All-ones scale; tcp_mkoptions() omits the window scale
		 * option when the cookie still carries this value.
		 */
		.wscale = TS_OPT_WSCALE_MASK,
		.option_timestamp = false,
		.option_sack = false,
		/* Note: the verifier currently tracks .off as an unbounded
		 *       scalar. Should the verifier become smarter and compute
		 *       a bounded value for it, beware that this might hinder
		 *       bpf_loop() convergence validation.
		 */
		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
	};
	u32 cookie;

	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);

	if (!loop_ctx.option_timestamp)
		return false;

	/* Clock in the high bits, option summary in the low TSBITS bits. */
	cookie = tcp_clock_ms() & ~TSMASK;
	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
	if (loop_ctx.option_sack)
		cookie |= TS_OPT_SACK;
	if (tcp_header->ece && tcp_header->cwr)
		cookie |= TS_OPT_ECN;
	*tsval = bpf_htonl(cookie);

	return true;
}
307 
308 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
309 						 __u8 *ttl, bool ipv6)
310 {
311 	__u32 key = 0;
312 	__u64 *value;
313 
314 	value = bpf_map_lookup_elem(&values, &key);
315 	if (value && *value != 0) {
316 		if (ipv6)
317 			*mss = (*value >> 32) & 0xffff;
318 		else
319 			*mss = *value & 0xffff;
320 		*wscale = (*value >> 16) & 0xf;
321 		*ttl = (*value >> 24) & 0xff;
322 		return;
323 	}
324 
325 	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
326 	*wscale = DEFAULT_WSCALE;
327 	*ttl = DEFAULT_TTL;
328 }
329 
330 static __always_inline void values_inc_synacks(void)
331 {
332 	__u32 key = 1;
333 	__u64 *value;
334 
335 	value = bpf_map_lookup_elem(&values, &key);
336 	if (value)
337 		__sync_fetch_and_add(value, 1);
338 }
339 
340 static __always_inline bool check_port_allowed(__u16 port)
341 {
342 	__u32 i;
343 
344 	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
345 		__u32 key = i;
346 		__u16 *value;
347 
348 		value = bpf_map_lookup_elem(&allowed_ports, &key);
349 
350 		if (!value)
351 			break;
352 		/* 0 is a terminator value. Check it first to avoid matching on
353 		 * a forbidden port == 0 and returning true.
354 		 */
355 		if (*value == 0)
356 			break;
357 
358 		if (*value == port)
359 			return true;
360 	}
361 
362 	return false;
363 }
364 
/* Header locations found by tcp_dissect() and refreshed by syncookie_part2().
 * Exactly one of ipv4/ipv6 is non-NULL after a successful dissect.
 */
struct header_pointers {
	struct ethhdr *eth;	/* Ethernet header at the start of the packet */
	struct iphdr *ipv4;	/* IPv4 header, or NULL for IPv6 packets */
	struct ipv6hdr *ipv6;	/* IPv6 header, or NULL for IPv4 packets */
	struct tcphdr *tcp;	/* TCP header */
	__u16 tcp_len;		/* TCP header length in bytes (doff * 4) */
};
372 
/* Locate the Ethernet, IP and TCP headers of a packet and record them in
 * @hdr.
 *
 * Returns XDP_TX when a TCP header was found and @hdr is filled in (callers
 * treat XDP_TX as "keep processing"), XDP_PASS for traffic this program does
 * not handle (other EtherTypes, non-TCP protocols) and XDP_DROP for truncated
 * or malformed headers.
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
				       struct header_pointers *hdr)
{
	hdr->eth = data;
	if (hdr->eth + 1 > data_end)
		return XDP_DROP;

	switch (bpf_ntohs(hdr->eth->h_proto)) {
	case ETH_P_IP:
		hdr->ipv6 = NULL;

		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv4 + 1 > data_end)
			return XDP_DROP;
		/* The header length field must cover at least the fixed header. */
		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
			return XDP_DROP;
		if (hdr->ipv4->version != 4)
			return XDP_DROP;

		if (hdr->ipv4->protocol != IPPROTO_TCP)
			return XDP_PASS;

		/* TCP follows the IPv4 header including any IPv4 options. */
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
		break;
	case ETH_P_IPV6:
		hdr->ipv4 = NULL;

		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv6 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv6->version != 6)
			return XDP_DROP;

		/* XXX: Extension headers are not supported and could circumvent
		 * XDP SYN flood protection.
		 */
		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
		break;
	default:
		/* XXX: VLANs will circumvent XDP SYN flood protection. */
		return XDP_PASS;
	}

	if (hdr->tcp + 1 > data_end)
		return XDP_DROP;
	/* Data offset is counted in 32-bit words. */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_DROP;

	return XDP_TX;
}
427 
428 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
429 {
430 	struct bpf_ct_opts___local ct_lookup_opts = {
431 		.netns_id = BPF_F_CURRENT_NETNS,
432 		.l4proto = IPPROTO_TCP,
433 	};
434 	struct bpf_sock_tuple tup = {};
435 	struct nf_conn *ct;
436 	__u32 tup_size;
437 
438 	if (hdr->ipv4) {
439 		/* TCP doesn't normally use fragments, and XDP can't reassemble
440 		 * them.
441 		 */
442 		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
443 			return XDP_DROP;
444 
445 		tup.ipv4.saddr = hdr->ipv4->saddr;
446 		tup.ipv4.daddr = hdr->ipv4->daddr;
447 		tup.ipv4.sport = hdr->tcp->source;
448 		tup.ipv4.dport = hdr->tcp->dest;
449 		tup_size = sizeof(tup.ipv4);
450 	} else if (hdr->ipv6) {
451 		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
452 		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
453 		tup.ipv6.sport = hdr->tcp->source;
454 		tup.ipv6.dport = hdr->tcp->dest;
455 		tup_size = sizeof(tup.ipv6);
456 	} else {
457 		/* The verifier can't track that either ipv4 or ipv6 is not
458 		 * NULL.
459 		 */
460 		return XDP_ABORTED;
461 	}
462 	if (xdp)
463 		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
464 	else
465 		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
466 	if (ct) {
467 		unsigned long status = ct->status;
468 
469 		bpf_ct_release(ct);
470 		if (status & IPS_CONFIRMED)
471 			return XDP_PASS;
472 	} else if (ct_lookup_opts.error != -ENOENT) {
473 		return XDP_ABORTED;
474 	}
475 
476 	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
477 	return XDP_TX;
478 }
479 
480 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
481 					  __u8 wscale)
482 {
483 	__be32 *start = buf;
484 
485 	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
486 
487 	if (!tsopt)
488 		return buf - start;
489 
490 	if (tsopt[0] & bpf_htonl(1 << 4))
491 		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
492 				   (TCPOLEN_SACK_PERM << 16) |
493 				   (TCPOPT_TIMESTAMP << 8) |
494 				   TCPOLEN_TIMESTAMP);
495 	else
496 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
497 				   (TCPOPT_NOP << 16) |
498 				   (TCPOPT_TIMESTAMP << 8) |
499 				   TCPOLEN_TIMESTAMP);
500 	*buf++ = tsopt[0];
501 	*buf++ = tsopt[1];
502 
503 	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
504 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
505 				   (TCPOPT_WINDOW << 16) |
506 				   (TCPOLEN_WINDOW << 8) |
507 				   wscale);
508 
509 	return buf - start;
510 }
511 
512 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
513 					   __u32 cookie, __be32 *tsopt,
514 					   __u16 mss, __u8 wscale)
515 {
516 	void *tcp_options;
517 
518 	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
519 	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
520 		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
521 	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
522 	swap(tcp_header->source, tcp_header->dest);
523 	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
524 	tcp_header->seq = bpf_htonl(cookie);
525 	tcp_header->window = 0;
526 	tcp_header->urg_ptr = 0;
527 	tcp_header->check = 0; /* Calculate checksum later. */
528 
529 	tcp_options = (void *)(tcp_header + 1);
530 	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
531 }
532 
533 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
534 					     __u32 cookie, __be32 *tsopt)
535 {
536 	__u8 wscale;
537 	__u16 mss;
538 	__u8 ttl;
539 
540 	values_get_tcpipopts(&mss, &wscale, &ttl, false);
541 
542 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
543 
544 	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
545 	hdr->ipv4->check = 0; /* Calculate checksum later. */
546 	hdr->ipv4->tos = 0;
547 	hdr->ipv4->id = 0;
548 	hdr->ipv4->ttl = ttl;
549 
550 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
551 
552 	hdr->tcp_len = hdr->tcp->doff * 4;
553 	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
554 }
555 
556 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
557 					     __u32 cookie, __be32 *tsopt)
558 {
559 	__u8 wscale;
560 	__u16 mss;
561 	__u8 ttl;
562 
563 	values_get_tcpipopts(&mss, &wscale, &ttl, true);
564 
565 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
566 
567 	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
568 	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
569 	hdr->ipv6->hop_limit = ttl;
570 
571 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
572 
573 	hdr->tcp_len = hdr->tcp->doff * 4;
574 	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
575 }
576 
/* Answer a SYN with a SYN-cookie SYNACK built in place.
 *
 * Steps, in order: reject SYNs that also carry FIN or RST and SYNs to ports
 * not listed in allowed_ports; verify the IPv4 header checksum (IPv4 only)
 * and the TCP checksum; obtain a cookie from
 * bpf_tcp_raw_gen_syncookie_ipv{4,6}(); derive the timestamp option values;
 * rewrite the Ethernet, IP and TCP headers into a SYNACK; recompute the
 * checksums; resize the packet to exactly the headers; bump the SYNACK
 * counter.
 *
 * @ctx is the XDP or tc context, selected by @xdp. @data/@data_end must be
 * the current packet bounds.
 *
 * Returns XDP_TX when the rewritten packet should be transmitted, XDP_DROP
 * for rejected SYNs and XDP_ABORTED when a helper fails or a bounds check
 * does not hold.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
						void *ctx,
						void *data, void *data_end,
						bool xdp)
{
	__u32 old_pkt_size, new_pkt_size;
	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
	 * the pointer value and use it directly, otherwise tcp_mkoptions is
	 * (mis)compiled like this:
	 *   if (!tsopt)
	 *       return buf - start;
	 *   reg = stored_return_value_of_tscookie_init;
	 *   if (reg)
	 *       tsopt = tsopt_buf;
	 *   else
	 *       tsopt = NULL;
	 *   ...
	 *   *buf++ = tsopt[1];
	 * It creates a dead branch where tsopt is assigned NULL, but the
	 * verifier can't prove it's dead and blocks the program.
	 */
	__be32 * volatile tsopt = NULL;
	__be32 tsopt_buf[2] = {};
	__u16 ip_len;
	__u32 cookie;
	__s64 value;

	/* Checksum is not yet verified, but both checksum failure and TCP
	 * header checks return XDP_DROP, so the order doesn't matter.
	 */
	if (hdr->tcp->fin || hdr->tcp->rst)
		return XDP_DROP;

	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
	 * ports.
	 */
	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
		return XDP_DROP;

	if (hdr->ipv4) {
		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_fold(value) != 0)
			return XDP_DROP; /* Bad IPv4 checksum. */

		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		/* The SYNACK is sent without IPv4 options. */
		ip_len = sizeof(*hdr->ipv4);

		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
						       hdr->tcp_len);
	} else if (hdr->ipv6) {
		/* Check the TCP checksum before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		ip_len = sizeof(*hdr->ipv6);

		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
						       hdr->tcp_len);
	} else {
		return XDP_ABORTED;
	}

	/* A negative value from the cookie helper is an error. */
	if (value < 0)
		return XDP_ABORTED;
	cookie = (__u32)value;

	/* tsopt stays NULL when the SYN carried no usable timestamp option;
	 * the SYNACK then gets the MSS option only.
	 */
	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
		tsopt = tsopt_buf;

	/* Check that there is enough space for a SYNACK. It also covers
	 * the check that the destination of the __builtin_memmove below
	 * doesn't overflow.
	 */
	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	if (hdr->ipv4) {
		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
			struct tcphdr *new_tcp_header;

			/* Drop the IPv4 options: move the fixed TCP header
			 * right behind the fixed IPv4 header. The TCP options
			 * are regenerated afterwards.
			 */
			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
			hdr->tcp = new_tcp_header;

			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
		}

		tcpv4_gen_synack(hdr, cookie, tsopt);
	} else if (hdr->ipv6) {
		tcpv6_gen_synack(hdr, cookie, tsopt);
	} else {
		return XDP_ABORTED;
	}

	/* Recalculate checksums. */
	hdr->tcp->check = 0;
	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
	if (value < 0)
		return XDP_ABORTED;
	if (hdr->ipv4) {
		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
						    hdr->ipv4->daddr,
						    hdr->tcp_len,
						    IPPROTO_TCP,
						    value);

		hdr->ipv4->check = 0;
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
		if (value < 0)
			return XDP_ABORTED;
		hdr->ipv4->check = csum_fold(value);
	} else if (hdr->ipv6) {
		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
						  &hdr->ipv6->daddr,
						  hdr->tcp_len,
						  IPPROTO_TCP,
						  value);
	} else {
		return XDP_ABORTED;
	}

	/* Set the new packet size. */
	old_pkt_size = data_end - data;
	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	if (xdp) {
		/* The XDP helper takes a delta ... */
		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
			return XDP_ABORTED;
	} else {
		/* ... while the skb helper takes the new total length. */
		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
			return XDP_ABORTED;
	}

	values_inc_synacks();

	return XDP_TX;
}
728 
729 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
730 {
731 	int err;
732 
733 	if (hdr->tcp->rst)
734 		return XDP_DROP;
735 
736 	if (hdr->ipv4)
737 		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
738 	else if (hdr->ipv6)
739 		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
740 	else
741 		return XDP_ABORTED;
742 	if (err)
743 		return XDP_DROP;
744 
745 	return XDP_PASS;
746 }
747 
/* First half of the pipeline, run on the packet as received.
 *
 * Dissects the headers, lets packets through that conntrack reports as
 * belonging to a confirmed connection, drops packets whose SYN and ACK flags
 * are both set or both clear, and finally grows the packet tail by
 * TCP_MAXLEN - hdr->tcp_len bytes.
 *
 * Returns XDP_TX when the caller should go on to syncookie_part2() (after
 * re-reading the packet bounds from @ctx); any other value is the final
 * verdict for the packet.
 */
static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	int ret;

	ret = tcp_dissect(data, data_end, hdr);
	if (ret != XDP_TX)
		return ret;

	ret = tcp_lookup(ctx, hdr, xdp);
	if (ret != XDP_TX)
		return ret;

	/* Packet is TCP and doesn't belong to an established connection. */

	/* Exactly one of SYN and ACK must be set. */
	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
		return XDP_DROP;

	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
	 */
	if (xdp) {
		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
			return XDP_ABORTED;
	} else {
		/* Without volatile the verifier throws this error:
		 * R9 32-bit pointer arithmetic prohibited
		 */
		volatile u64 old_len = data_end - data;

		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
			return XDP_ABORTED;
	}

	return XDP_TX;
}
784 
/* Second half of the pipeline, run after syncookie_part1() resized the
 * packet.
 *
 * @data/@data_end are the packet bounds re-read by the caller. The header
 * pointers in @hdr are recomputed from them (which of ipv4/ipv6 is set is
 * kept from part 1), the bounds are re-checked against the maximum header
 * sizes, tcp_len is re-read, and the packet is dispatched to the SYN or ACK
 * handler depending on the SYN flag.
 *
 * Returns the handler's verdict, or XDP_ABORTED when a bounds check fails.
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	if (hdr->ipv4) {
		hdr->eth = data;
		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		/* IPV4_MAXLEN is needed when calculating checksum.
		 * At least sizeof(struct iphdr) is needed here to access ihl.
		 */
		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
	} else if (hdr->ipv6) {
		hdr->eth = data;
		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
	} else {
		return XDP_ABORTED;
	}

	/* One check covers everything up to a maximum-size TCP header. */
	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	/* We run out of registers, tcp_len gets spilled to the stack, and the
	 * verifier forgets its min and max values checked above in tcp_dissect.
	 */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_ABORTED;

	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
			       syncookie_handle_ack(hdr);
}
818 
819 SEC("xdp")
820 int syncookie_xdp(struct xdp_md *ctx)
821 {
822 	void *data_end = (void *)(long)ctx->data_end;
823 	void *data = (void *)(long)ctx->data;
824 	struct header_pointers hdr;
825 	int ret;
826 
827 	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
828 	if (ret != XDP_TX)
829 		return ret;
830 
831 	data_end = (void *)(long)ctx->data_end;
832 	data = (void *)(long)ctx->data;
833 
834 	return syncookie_part2(ctx, data, data_end, &hdr, true);
835 }
836 
837 SEC("tc")
838 int syncookie_tc(struct __sk_buff *skb)
839 {
840 	void *data_end = (void *)(long)skb->data_end;
841 	void *data = (void *)(long)skb->data;
842 	struct header_pointers hdr;
843 	int ret;
844 
845 	ret = syncookie_part1(skb, data, data_end, &hdr, false);
846 	if (ret != XDP_TX)
847 		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
848 
849 	data_end = (void *)(long)skb->data_end;
850 	data = (void *)(long)skb->data;
851 
852 	ret = syncookie_part2(skb, data, data_end, &hdr, false);
853 	switch (ret) {
854 	case XDP_PASS:
855 		return TC_ACT_OK;
856 	case XDP_TX:
857 		return bpf_redirect(skb->ifindex, 0);
858 	default:
859 		return TC_ACT_SHOT;
860 	}
861 }
862 
/* License string for the program, emitted into the "license" section. */
char _license[] SEC("license") = "GPL";
864