xref: /linux/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c (revision 3652117f854819a148ff0fbe4492587d3520b5e5)
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #include "vmlinux.h"
5 
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_endian.h>
8 #include <asm/errno.h>
9 
/* Return values used by the TC entry point (syncookie_tc). */
10 #define TC_ACT_OK 0
11 #define TC_ACT_SHOT 2
12 
/* Nanoseconds per second; tcp_ns_to_ts() divides it by TCP_TS_HZ. */
13 #define NSEC_PER_SEC 1000000000L
14 
/* Ethernet address length in bytes, and the EtherTypes tcp_dissect() handles. */
15 #define ETH_ALEN 6
16 #define ETH_P_IP 0x0800
17 #define ETH_P_IPV6 0x86DD
18 
/* Access words[3] of the TCP header viewed as union tcp_word_hdr.
 * tcp_gen_synack() writes the TCP flags through it and notes that doff
 * lives in the same word.
 */
19 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
20 
/* Host-order masks for iphdr->frag_off; tcp_lookup() converts them with
 * bpf_htons() before testing the header field.
 */
21 #define IP_DF 0x4000
22 #define IP_MF 0x2000
23 #define IP_OFFSET 0x1fff
24 
/* Value of ipv6hdr->nexthdr that tcp_dissect() accepts as TCP. */
25 #define NEXTHDR_TCP 6
26 
/* TCP option kind bytes. */
27 #define TCPOPT_NOP 1
28 #define TCPOPT_EOL 0
29 #define TCPOPT_MSS 2
30 #define TCPOPT_WINDOW 3
31 #define TCPOPT_SACK_PERM 4
32 #define TCPOPT_TIMESTAMP 8
33 
/* TCP option lengths in bytes, compared against an option's size byte. */
34 #define TCPOLEN_MSS 4
35 #define TCPOLEN_WINDOW 3
36 #define TCPOLEN_SACK_PERM 2
37 #define TCPOLEN_TIMESTAMP 10
38 
/* Timestamp-cookie layout used by tscookie_init(): the low TSBITS bits of the
 * generated tsval are replaced by option flags -- the window scale in the low
 * four bits (TS_OPT_WSCALE_MASK), SACK in bit 4 and ECN in bit 5.
 * TCP_TS_HZ is the tick rate tcp_ns_to_ts() converts nanoseconds to, and
 * TCP_MAX_WSCALE is the clamp applied to a received window-scale option.
 */
39 #define TCP_TS_HZ 1000
40 #define TS_OPT_WSCALE_MASK 0xf
41 #define TS_OPT_SACK (1 << 4)
42 #define TS_OPT_ECN (1 << 5)
43 #define TSBITS 6
44 #define TSMASK (((__u32)1 << TSBITS) - 1)
45 #define TCP_MAX_WSCALE 14U
46 
/* Header sizes in bytes used for packet bounds checks and tail adjustment. */
47 #define IPV4_MAXLEN 60
48 #define TCP_MAXLEN 60
49 
/* Fallbacks used by values_get_tcpipopts() when entry 0 of the "values" map
 * is absent or zero.
 */
50 #define DEFAULT_MSS4 1460
51 #define DEFAULT_MSS6 1440
52 #define DEFAULT_WSCALE 7
53 #define DEFAULT_TTL 64
/* Number of entries in the "allowed_ports" map. */
54 #define MAX_ALLOWED_PORTS 8
55 
/* Exchange the values of two lvalues through a typeof() temporary.
 * Both arguments are expanded more than once, so they must not have side
 * effects.
 */
56 #define swap(a, b) \
57 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
58 
/* Read a value of the given type from ptr by going through a packed
 * single-member struct, so the compiler does not assume natural alignment.
 */
59 #define __get_unaligned_t(type, ptr) ({						\
60 	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
61 	__pptr->x;								\
62 })
63 
/* Unaligned read of *ptr, with the type deduced from the pointer. */
64 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
65 
/* Two-slot array map of 64-bit values:
 *   key 0 - packed SYNACK parameters decoded by values_get_tcpipopts():
 *           bits 0-15 IPv4 MSS, bits 16-19 window scale, bits 24-31 TTL,
 *           bits 32-47 IPv6 MSS; a value of 0 selects the built-in defaults.
 *   key 1 - counter atomically incremented by values_inc_synacks().
 */
66 struct {
67 	__uint(type, BPF_MAP_TYPE_ARRAY);
68 	__type(key, __u32);
69 	__type(value, __u64);
70 	__uint(max_entries, 2);
71 } values SEC(".maps");
72 
/* Array map of TCP destination ports for which SYN cookies are issued.
 * check_port_allowed() scans it from index 0 and stops at the first entry
 * equal to 0, so 0 acts as a list terminator. Entries are compared with the
 * destination port after bpf_ntohs(), i.e. in host byte order.
 */
73 struct {
74 	__uint(type, BPF_MAP_TYPE_ARRAY);
75 	__type(key, __u32);
76 	__type(value, __u16);
77 	__uint(max_entries, MAX_ALLOWED_PORTS);
78 } allowed_ports SEC(".maps");
79 
80 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
81  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
82  */
83 
/* Options block handed to the conntrack lookup kfuncs.
 * tcp_lookup() fills in netns_id and l4proto, and reads error back when the
 * lookup returns NULL. dir and reserved are left zero-initialised by this
 * file.
 */
84 struct bpf_ct_opts___local {
85 	s32 netns_id;
86 	s32 error;
87 	u8 l4proto;
88 	u8 dir;
89 	u8 reserved[2];
90 } __attribute__((preserve_access_index));
91 
/* netns_id value passed by tcp_lookup(); presumably selects the caller's
 * current network namespace -- confirm against the kfunc documentation.
 */
92 #define BPF_F_CURRENT_NETNS (-1)
93 
/* Conntrack lookup kfunc for XDP contexts, resolved as a kernel symbol.
 * tcp_lookup() treats a NULL return as failure and then inspects opts->error.
 */
94 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
95 					 struct bpf_sock_tuple *bpf_tuple,
96 					 __u32 len_tuple,
97 					 struct bpf_ct_opts___local *opts,
98 					 __u32 len_opts) __ksym;
99 
/* Same lookup for TC (__sk_buff) contexts. */
100 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
101 					 struct bpf_sock_tuple *bpf_tuple,
102 					 u32 len_tuple,
103 					 struct bpf_ct_opts___local *opts,
104 					 u32 len_opts) __ksym;
105 
/* Releases an entry returned by one of the lookups above; tcp_lookup() calls
 * it on every non-NULL result.
 */
106 extern void bpf_ct_release(struct nf_conn *ct) __ksym;
107 
108 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
109 {
110 	__u8 tmp[ETH_ALEN];
111 
112 	__builtin_memcpy(tmp, a, ETH_ALEN);
113 	__builtin_memcpy(a, b, ETH_ALEN);
114 	__builtin_memcpy(b, tmp, ETH_ALEN);
115 }
116 
/* Fold a 32-bit one's-complement partial sum down to 16 bits and return its
 * bitwise complement.
 *
 * @csum: 32-bit partial checksum
 *
 * The fold is applied twice because the first addition of the two halves can
 * itself carry into bit 16.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 folded = (csum >> 16) + (csum & 0xffff);

	folded = (folded >> 16) + (folded & 0xffff);

	return (__u16)~folded;
}
123 
124 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
125 					       __u32 len, __u8 proto,
126 					       __u32 csum)
127 {
128 	__u64 s = csum;
129 
130 	s += (__u32)saddr;
131 	s += (__u32)daddr;
132 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
133 	s += proto + len;
134 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
135 	s += (proto + len) << 8;
136 #else
137 #error Unknown endian
138 #endif
139 	s = (s & 0xffffffff) + (s >> 32);
140 	s = (s & 0xffffffff) + (s >> 32);
141 
142 	return csum_fold((__u32)s);
143 }
144 
145 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
146 					     const struct in6_addr *daddr,
147 					     __u32 len, __u8 proto, __u32 csum)
148 {
149 	__u64 sum = csum;
150 	int i;
151 
152 #pragma unroll
153 	for (i = 0; i < 4; i++)
154 		sum += (__u32)saddr->in6_u.u6_addr32[i];
155 
156 #pragma unroll
157 	for (i = 0; i < 4; i++)
158 		sum += (__u32)daddr->in6_u.u6_addr32[i];
159 
160 	/* Don't combine additions to avoid 32-bit overflow. */
161 	sum += bpf_htonl(len);
162 	sum += bpf_htonl(proto);
163 
164 	sum = (sum & 0xffffffff) + (sum >> 32);
165 	sum = (sum & 0xffffffff) + (sum >> 32);
166 
167 	return csum_fold((__u32)sum);
168 }
169 
170 static __always_inline __u64 tcp_clock_ns(void)
171 {
172 	return bpf_ktime_get_ns();
173 }
174 
175 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
176 {
177 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
178 }
179 
180 static __always_inline __u32 tcp_clock_ms(void)
181 {
182 	return tcp_ns_to_ts(tcp_clock_ns());
183 }
184 
/* State shared between tscookie_init() and the TCP option parser that runs
 * as a bpf_loop() callback.
 */
185 struct tcpopt_context {
186 	__u8 *ptr;		/* next option byte to parse */
187 	__u8 *end;		/* end of the TCP header (header start + tcp_len) */
188 	void *data_end;		/* end of packet data, used for bounds checks */
189 	__be32 *tsecr;		/* receives the client's tsval when a timestamp option is seen */
190 	__u8 wscale;		/* window scale found so far, clamped to TCP_MAX_WSCALE */
191 	bool option_timestamp;	/* a well-formed timestamp option was seen */
192 	bool option_sack;	/* a well-formed SACK-permitted option was seen */
193 };
194 
/* Parse the single TCP option at ctx->ptr and advance ctx->ptr past it.
 *
 * Window-scale, timestamp and SACK-permitted options with the expected length
 * update the matching fields of ctx; every other option kind is skipped.
 * All reads are bounds-checked against both ctx->end (end of the TCP header)
 * and ctx->data_end (end of the packet).
 *
 * Returns 0 when one option was consumed and parsing may continue, 1 when
 * parsing must stop (end reached, EOL option, or a size byte below 2 or
 * running past the header).
 */
195 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
196 {
197 	__u8 opcode, opsize;
198 
199 	if (ctx->ptr >= ctx->end)
200 		return 1;
201 	if (ctx->ptr >= ctx->data_end)
202 		return 1;
203 
204 	opcode = ctx->ptr[0];
205 
206 	if (opcode == TCPOPT_EOL)
207 		return 1;
	/* NOP is a one-byte option with no size byte. */
208 	if (opcode == TCPOPT_NOP) {
209 		++ctx->ptr;
210 		return 0;
211 	}
212 
213 	if (ctx->ptr + 1 >= ctx->end)
214 		return 1;
215 	if (ctx->ptr + 1 >= ctx->data_end)
216 		return 1;
217 	opsize = ctx->ptr[1];
	/* The size covers the kind and size bytes, so anything below 2 is malformed. */
218 	if (opsize < 2)
219 		return 1;
220 
221 	if (ctx->ptr + opsize > ctx->end)
222 		return 1;
223 
224 	switch (opcode) {
225 	case TCPOPT_WINDOW:
226 		if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
227 			ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
228 		break;
229 	case TCPOPT_TIMESTAMP:
230 		if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
231 			ctx->option_timestamp = true;
232 			/* Client's tsval becomes our tsecr. */
233 			*ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
234 		}
235 		break;
236 	case TCPOPT_SACK_PERM:
237 		if (opsize == TCPOLEN_SACK_PERM)
238 			ctx->option_sack = true;
239 		break;
240 	}
241 
242 	ctx->ptr += opsize;
243 
244 	return 0;
245 }
246 
247 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
248 {
249 	int i;
250 
251 	for (i = 0; i < 7; i++)
252 		if (tscookie_tcpopt_parse(context))
253 			return 1;
254 	return 0;
255 }
256 
/* Build the timestamp option values for a SYNACK from the client's SYN.
 *
 * @tcp_header: TCP header of the received SYN
 * @tcp_len:    TCP header length in bytes (doff * 4)
 * @tsval:      out: generated timestamp value, in network byte order
 * @tsecr:      out: the client's tsval, written by the option parser
 * @data_end:   end of packet data
 *
 * The options are scanned with bpf_loop() in 6 batches of 7 options. If the
 * SYN carries no timestamp option the function returns false and *tsval is
 * left untouched. Otherwise *tsval is the current millisecond clock with its
 * low TSBITS bits replaced by the window scale (TS_OPT_WSCALE_MASK when no
 * window-scale option was seen), the SACK flag, and the ECN flag when both
 * ECE and CWR are set in the SYN.
 */
257 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
258 					  __u16 tcp_len, __be32 *tsval,
259 					  __be32 *tsecr, void *data_end)
260 {
261 	struct tcpopt_context loop_ctx = {
262 		.ptr = (__u8 *)(tcp_header + 1),
263 		.end = (__u8 *)tcp_header + tcp_len,
264 		.data_end = data_end,
265 		.tsecr = tsecr,
266 		.wscale = TS_OPT_WSCALE_MASK,
267 		.option_timestamp = false,
268 		.option_sack = false,
269 	};
270 	u32 cookie;
271 
272 	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
273 
274 	if (!loop_ctx.option_timestamp)
275 		return false;
276 
277 	cookie = tcp_clock_ms() & ~TSMASK;
278 	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
279 	if (loop_ctx.option_sack)
280 		cookie |= TS_OPT_SACK;
281 	if (tcp_header->ece && tcp_header->cwr)
282 		cookie |= TS_OPT_ECN;
283 	*tsval = bpf_htonl(cookie);
284 
285 	return true;
286 }
287 
288 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
289 						 __u8 *ttl, bool ipv6)
290 {
291 	__u32 key = 0;
292 	__u64 *value;
293 
294 	value = bpf_map_lookup_elem(&values, &key);
295 	if (value && *value != 0) {
296 		if (ipv6)
297 			*mss = (*value >> 32) & 0xffff;
298 		else
299 			*mss = *value & 0xffff;
300 		*wscale = (*value >> 16) & 0xf;
301 		*ttl = (*value >> 24) & 0xff;
302 		return;
303 	}
304 
305 	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
306 	*wscale = DEFAULT_WSCALE;
307 	*ttl = DEFAULT_TTL;
308 }
309 
310 static __always_inline void values_inc_synacks(void)
311 {
312 	__u32 key = 1;
313 	__u64 *value;
314 
315 	value = bpf_map_lookup_elem(&values, &key);
316 	if (value)
317 		__sync_fetch_and_add(value, 1);
318 }
319 
320 static __always_inline bool check_port_allowed(__u16 port)
321 {
322 	__u32 i;
323 
324 	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
325 		__u32 key = i;
326 		__u16 *value;
327 
328 		value = bpf_map_lookup_elem(&allowed_ports, &key);
329 
330 		if (!value)
331 			break;
332 		/* 0 is a terminator value. Check it first to avoid matching on
333 		 * a forbidden port == 0 and returning true.
334 		 */
335 		if (*value == 0)
336 			break;
337 
338 		if (*value == port)
339 			return true;
340 	}
341 
342 	return false;
343 }
344 
/* Pointers into the packet filled in by tcp_dissect() and refreshed by
 * syncookie_part2() after the packet has been resized.
 */
345 struct header_pointers {
346 	struct ethhdr *eth;	/* Ethernet header at the start of the packet */
347 	struct iphdr *ipv4;	/* IPv4 header, or NULL for an IPv6 packet */
348 	struct ipv6hdr *ipv6;	/* IPv6 header, or NULL for an IPv4 packet */
349 	struct tcphdr *tcp;	/* TCP header */
350 	__u16 tcp_len;		/* TCP header length in bytes (doff * 4) */
351 };
352 
/* Locate the Ethernet, IPv4/IPv6 and TCP headers of a packet.
 *
 * @data:     start of packet data
 * @data_end: end of packet data
 * @hdr:      out: header pointers and TCP header length
 *
 * Returns XDP_DROP for truncated or malformed headers, XDP_PASS for anything
 * that is not TCP directly over IPv4/IPv6, and XDP_TX when @hdr was filled in
 * and processing should continue (callers treat XDP_TX as "keep going").
 */
353 static __always_inline int tcp_dissect(void *data, void *data_end,
354 				       struct header_pointers *hdr)
355 {
356 	hdr->eth = data;
357 	if (hdr->eth + 1 > data_end)
358 		return XDP_DROP;
359 
360 	switch (bpf_ntohs(hdr->eth->h_proto)) {
361 	case ETH_P_IP:
362 		hdr->ipv6 = NULL;
363 
364 		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
365 		if (hdr->ipv4 + 1 > data_end)
366 			return XDP_DROP;
367 		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
368 			return XDP_DROP;
369 		if (hdr->ipv4->version != 4)
370 			return XDP_DROP;
371 
372 		if (hdr->ipv4->protocol != IPPROTO_TCP)
373 			return XDP_PASS;
374 
		/* ihl counts 32-bit words, so the TCP header follows any IPv4 options. */
375 		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
376 		break;
377 	case ETH_P_IPV6:
378 		hdr->ipv4 = NULL;
379 
380 		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
381 		if (hdr->ipv6 + 1 > data_end)
382 			return XDP_DROP;
383 		if (hdr->ipv6->version != 6)
384 			return XDP_DROP;
385 
386 		/* XXX: Extension headers are not supported and could circumvent
387 		 * XDP SYN flood protection.
388 		 */
389 		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
390 			return XDP_PASS;
391 
392 		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
393 		break;
394 	default:
395 		/* XXX: VLANs will circumvent XDP SYN flood protection. */
396 		return XDP_PASS;
397 	}
398 
399 	if (hdr->tcp + 1 > data_end)
400 		return XDP_DROP;
401 	hdr->tcp_len = hdr->tcp->doff * 4;
402 	if (hdr->tcp_len < sizeof(*hdr->tcp))
403 		return XDP_DROP;
404 
405 	return XDP_TX;
406 }
407 
408 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
409 {
410 	struct bpf_ct_opts___local ct_lookup_opts = {
411 		.netns_id = BPF_F_CURRENT_NETNS,
412 		.l4proto = IPPROTO_TCP,
413 	};
414 	struct bpf_sock_tuple tup = {};
415 	struct nf_conn *ct;
416 	__u32 tup_size;
417 
418 	if (hdr->ipv4) {
419 		/* TCP doesn't normally use fragments, and XDP can't reassemble
420 		 * them.
421 		 */
422 		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
423 			return XDP_DROP;
424 
425 		tup.ipv4.saddr = hdr->ipv4->saddr;
426 		tup.ipv4.daddr = hdr->ipv4->daddr;
427 		tup.ipv4.sport = hdr->tcp->source;
428 		tup.ipv4.dport = hdr->tcp->dest;
429 		tup_size = sizeof(tup.ipv4);
430 	} else if (hdr->ipv6) {
431 		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
432 		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
433 		tup.ipv6.sport = hdr->tcp->source;
434 		tup.ipv6.dport = hdr->tcp->dest;
435 		tup_size = sizeof(tup.ipv6);
436 	} else {
437 		/* The verifier can't track that either ipv4 or ipv6 is not
438 		 * NULL.
439 		 */
440 		return XDP_ABORTED;
441 	}
442 	if (xdp)
443 		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
444 	else
445 		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
446 	if (ct) {
447 		unsigned long status = ct->status;
448 
449 		bpf_ct_release(ct);
450 		if (status & IPS_CONFIRMED_BIT)
451 			return XDP_PASS;
452 	} else if (ct_lookup_opts.error != -ENOENT) {
453 		return XDP_ABORTED;
454 	}
455 
456 	/* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */
457 	return XDP_TX;
458 }
459 
460 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
461 					  __u8 wscale)
462 {
463 	__be32 *start = buf;
464 
465 	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
466 
467 	if (!tsopt)
468 		return buf - start;
469 
470 	if (tsopt[0] & bpf_htonl(1 << 4))
471 		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
472 				   (TCPOLEN_SACK_PERM << 16) |
473 				   (TCPOPT_TIMESTAMP << 8) |
474 				   TCPOLEN_TIMESTAMP);
475 	else
476 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
477 				   (TCPOPT_NOP << 16) |
478 				   (TCPOPT_TIMESTAMP << 8) |
479 				   TCPOLEN_TIMESTAMP);
480 	*buf++ = tsopt[0];
481 	*buf++ = tsopt[1];
482 
483 	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
484 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
485 				   (TCPOPT_WINDOW << 16) |
486 				   (TCPOLEN_WINDOW << 8) |
487 				   wscale);
488 
489 	return buf - start;
490 }
491 
492 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
493 					   __u32 cookie, __be32 *tsopt,
494 					   __u16 mss, __u8 wscale)
495 {
496 	void *tcp_options;
497 
498 	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
499 	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
500 		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
501 	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
502 	swap(tcp_header->source, tcp_header->dest);
503 	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
504 	tcp_header->seq = bpf_htonl(cookie);
505 	tcp_header->window = 0;
506 	tcp_header->urg_ptr = 0;
507 	tcp_header->check = 0; /* Calculate checksum later. */
508 
509 	tcp_options = (void *)(tcp_header + 1);
510 	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
511 }
512 
513 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
514 					     __u32 cookie, __be32 *tsopt)
515 {
516 	__u8 wscale;
517 	__u16 mss;
518 	__u8 ttl;
519 
520 	values_get_tcpipopts(&mss, &wscale, &ttl, false);
521 
522 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
523 
524 	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
525 	hdr->ipv4->check = 0; /* Calculate checksum later. */
526 	hdr->ipv4->tos = 0;
527 	hdr->ipv4->id = 0;
528 	hdr->ipv4->ttl = ttl;
529 
530 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
531 
532 	hdr->tcp_len = hdr->tcp->doff * 4;
533 	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
534 }
535 
536 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
537 					     __u32 cookie, __be32 *tsopt)
538 {
539 	__u8 wscale;
540 	__u16 mss;
541 	__u8 ttl;
542 
543 	values_get_tcpipopts(&mss, &wscale, &ttl, true);
544 
545 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
546 
547 	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
548 	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
549 	hdr->ipv6->hop_limit = ttl;
550 
551 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
552 
553 	hdr->tcp_len = hdr->tcp->doff * 4;
554 	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
555 }
556 
/* Answer a SYN by rewriting the packet in place into a SYN-cookie SYNACK.
 *
 * @hdr:      header pointers re-validated by syncookie_part2()
 * @ctx:      program context; struct xdp_md when @xdp is true, otherwise
 *            struct __sk_buff
 * @data:     start of packet data
 * @data_end: end of packet data
 * @xdp:      selects the XDP or TC variant of the tail-adjust helper
 *
 * Steps, in order: reject SYNs carrying FIN or RST and SYNs to ports not in
 * "allowed_ports"; verify the IPv4 header checksum (IPv4 only) and the TCP
 * checksum; generate the cookie with bpf_tcp_raw_gen_syncookie_ipv{4,6}();
 * derive timestamp option values with tscookie_init(); for IPv4 packets with
 * IP options, move the fixed TCP header directly behind a minimal IPv4 header
 * and reset ihl; build the SYNACK; recompute the checksums; shrink or grow the
 * packet to Ethernet + IP + TCP header; bump the SYNACK counter.
 *
 * Returns XDP_TX when the SYNACK is ready to send, XDP_DROP for rejected
 * packets, and XDP_ABORTED when a helper fails or the packet lacks room.
 */
557 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
558 						void *ctx,
559 						void *data, void *data_end,
560 						bool xdp)
561 {
562 	__u32 old_pkt_size, new_pkt_size;
563 	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
564 	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
565 	 * the pointer value and use it directly, otherwise tcp_mkoptions is
566 	 * (mis)compiled like this:
567 	 *   if (!tsopt)
568 	 *       return buf - start;
569 	 *   reg = stored_return_value_of_tscookie_init;
570 	 *   if (reg)
571 	 *       tsopt = tsopt_buf;
572 	 *   else
573 	 *       tsopt = NULL;
574 	 *   ...
575 	 *   *buf++ = tsopt[1];
576 	 * It creates a dead branch where tsopt is assigned NULL, but the
577 	 * verifier can't prove it's dead and blocks the program.
578 	 */
579 	__be32 * volatile tsopt = NULL;
580 	__be32 tsopt_buf[2] = {};
581 	__u16 ip_len;
582 	__u32 cookie;
583 	__s64 value;
584 
585 	/* Checksum is not yet verified, but both checksum failure and TCP
586 	 * header checks return XDP_DROP, so the order doesn't matter.
587 	 */
588 	if (hdr->tcp->fin || hdr->tcp->rst)
589 		return XDP_DROP;
590 
591 	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
592 	 * ports.
593 	 */
594 	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
595 		return XDP_DROP;
596 
597 	if (hdr->ipv4) {
598 		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
599 		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
600 		if (value < 0)
601 			return XDP_ABORTED;
602 		if (csum_fold(value) != 0)
603 			return XDP_DROP; /* Bad IPv4 checksum. */
604 
605 		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
606 		if (value < 0)
607 			return XDP_ABORTED;
608 		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
609 				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
610 			return XDP_DROP; /* Bad TCP checksum. */
611 
		/* The reply always uses an IPv4 header without options. */
612 		ip_len = sizeof(*hdr->ipv4);
613 
614 		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
615 						       hdr->tcp_len);
616 	} else if (hdr->ipv6) {
617 		/* Check the TCP checksum before creating a SYNACK. */
618 		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
619 		if (value < 0)
620 			return XDP_ABORTED;
621 		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
622 				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
623 			return XDP_DROP; /* Bad TCP checksum. */
624 
625 		ip_len = sizeof(*hdr->ipv6);
626 
627 		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
628 						       hdr->tcp_len);
629 	} else {
630 		return XDP_ABORTED;
631 	}
632 
633 	if (value < 0)
634 		return XDP_ABORTED;
635 	cookie = (__u32)value;
636 
	/* tsopt stays NULL when the SYN carried no timestamp option. */
637 	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
638 			  &tsopt_buf[0], &tsopt_buf[1], data_end))
639 		tsopt = tsopt_buf;
640 
641 	/* Check that there is enough space for a SYNACK. It also covers
642 	 * the check that the destination of the __builtin_memmove below
643 	 * doesn't overflow.
644 	 */
645 	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
646 		return XDP_ABORTED;
647 
648 	if (hdr->ipv4) {
649 		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
650 			struct tcphdr *new_tcp_header;
651 
652 			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
653 			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
654 			hdr->tcp = new_tcp_header;
655 
656 			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
657 		}
658 
659 		tcpv4_gen_synack(hdr, cookie, tsopt);
660 	} else if (hdr->ipv6) {
661 		tcpv6_gen_synack(hdr, cookie, tsopt);
662 	} else {
663 		return XDP_ABORTED;
664 	}
665 
666 	/* Recalculate checksums. */
667 	hdr->tcp->check = 0;
668 	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
669 	if (value < 0)
670 		return XDP_ABORTED;
671 	if (hdr->ipv4) {
672 		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
673 						    hdr->ipv4->daddr,
674 						    hdr->tcp_len,
675 						    IPPROTO_TCP,
676 						    value);
677 
678 		hdr->ipv4->check = 0;
679 		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
680 		if (value < 0)
681 			return XDP_ABORTED;
682 		hdr->ipv4->check = csum_fold(value);
683 	} else if (hdr->ipv6) {
684 		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
685 						  &hdr->ipv6->daddr,
686 						  hdr->tcp_len,
687 						  IPPROTO_TCP,
688 						  value);
689 	} else {
690 		return XDP_ABORTED;
691 	}
692 
693 	/* Set the new packet size. */
694 	old_pkt_size = data_end - data;
695 	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
696 	if (xdp) {
697 		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
698 			return XDP_ABORTED;
699 	} else {
700 		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
701 			return XDP_ABORTED;
702 	}
703 
704 	values_inc_synacks();
705 
706 	return XDP_TX;
707 }
708 
709 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
710 {
711 	int err;
712 
713 	if (hdr->tcp->rst)
714 		return XDP_DROP;
715 
716 	if (hdr->ipv4)
717 		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
718 	else if (hdr->ipv6)
719 		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
720 	else
721 		return XDP_ABORTED;
722 	if (err)
723 		return XDP_DROP;
724 
725 	return XDP_PASS;
726 }
727 
/* First stage of SYN proxy processing.
 *
 * @ctx:      program context; struct xdp_md when @xdp is true, otherwise
 *            struct __sk_buff
 * @data:     start of packet data
 * @data_end: end of packet data
 * @hdr:      out: header pointers filled in by tcp_dissect()
 * @xdp:      selects the XDP or TC variants of the helpers
 *
 * Dissects the packet, consults conntrack, requires exactly one of SYN and
 * ACK to be set, and then grows the packet tail by TCP_MAXLEN - tcp_len.
 *
 * Returns XDP_TX when the caller should go on to syncookie_part2(); because
 * the packet was resized, the callers reload data and data_end from the
 * context first. Any other value is the final verdict.
 */
728 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
729 					   struct header_pointers *hdr, bool xdp)
730 {
731 	int ret;
732 
733 	ret = tcp_dissect(data, data_end, hdr);
734 	if (ret != XDP_TX)
735 		return ret;
736 
737 	ret = tcp_lookup(ctx, hdr, xdp);
738 	if (ret != XDP_TX)
739 		return ret;
740 
741 	/* Packet is TCP and doesn't belong to an established connection. */
742 
	/* Only a pure SYN or a pure ACK is handled; both or neither set is dropped. */
743 	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
744 		return XDP_DROP;
745 
746 	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
747 	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
748 	 */
749 	if (xdp) {
750 		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
751 			return XDP_ABORTED;
752 	} else {
753 		/* Without volatile the verifier throws this error:
754 		 * R9 32-bit pointer arithmetic prohibited
755 		 */
756 		volatile u64 old_len = data_end - data;
757 
758 		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
759 			return XDP_ABORTED;
760 	}
761 
762 	return XDP_TX;
763 }
764 
/* Second stage of SYN proxy processing, run after the packet was resized.
 *
 * @ctx:      program context; struct xdp_md when @xdp is true, otherwise
 *            struct __sk_buff
 * @data:     start of packet data, reloaded by the caller
 * @data_end: end of packet data, reloaded by the caller
 * @hdr:      header pointers from syncookie_part1(); only the NULL-ness of
 *            ipv4/ipv6 is reused, every pointer is recomputed from @data
 * @xdp:      forwarded to syncookie_handle_syn()
 *
 * Re-derives and bounds-checks the header pointers against the new packet
 * limits, then dispatches to syncookie_handle_syn() for SYN packets and to
 * syncookie_handle_ack() otherwise.
 *
 * Returns the handler's verdict, or XDP_ABORTED when a bounds check fails.
 */
765 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
766 					   struct header_pointers *hdr, bool xdp)
767 {
768 	if (hdr->ipv4) {
769 		hdr->eth = data;
770 		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
771 		/* IPV4_MAXLEN is needed when calculating checksum.
772 		 * At least sizeof(struct iphdr) is needed here to access ihl.
773 		 */
774 		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
775 			return XDP_ABORTED;
776 		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
777 	} else if (hdr->ipv6) {
778 		hdr->eth = data;
779 		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
780 		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
781 	} else {
782 		return XDP_ABORTED;
783 	}
784 
785 	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
786 		return XDP_ABORTED;
787 
788 	/* We run out of registers, tcp_len gets spilled to the stack, and the
789 	 * verifier forgets its min and max values checked above in tcp_dissect.
790 	 */
791 	hdr->tcp_len = hdr->tcp->doff * 4;
792 	if (hdr->tcp_len < sizeof(*hdr->tcp))
793 		return XDP_ABORTED;
794 
795 	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
796 			       syncookie_handle_ack(hdr);
797 }
798 
799 SEC("xdp")
800 int syncookie_xdp(struct xdp_md *ctx)
801 {
802 	void *data_end = (void *)(long)ctx->data_end;
803 	void *data = (void *)(long)ctx->data;
804 	struct header_pointers hdr;
805 	int ret;
806 
807 	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
808 	if (ret != XDP_TX)
809 		return ret;
810 
811 	data_end = (void *)(long)ctx->data_end;
812 	data = (void *)(long)ctx->data;
813 
814 	return syncookie_part2(ctx, data, data_end, &hdr, true);
815 }
816 
817 SEC("tc")
818 int syncookie_tc(struct __sk_buff *skb)
819 {
820 	void *data_end = (void *)(long)skb->data_end;
821 	void *data = (void *)(long)skb->data;
822 	struct header_pointers hdr;
823 	int ret;
824 
825 	ret = syncookie_part1(skb, data, data_end, &hdr, false);
826 	if (ret != XDP_TX)
827 		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
828 
829 	data_end = (void *)(long)skb->data_end;
830 	data = (void *)(long)skb->data;
831 
832 	ret = syncookie_part2(skb, data, data_end, &hdr, false);
833 	switch (ret) {
834 	case XDP_PASS:
835 		return TC_ACT_OK;
836 	case XDP_TX:
837 		return bpf_redirect(skb->ifindex, 0);
838 	default:
839 		return TC_ACT_SHOT;
840 	}
841 }
842 
/* License string placed in the "license" ELF section of the object.
 * NOTE(review): presumably "GPL" is declared so that GPL-only helpers and
 * kfuncs can be used -- confirm against the BPF loader documentation.
 */
843 char _license[] SEC("license") = "GPL";
844