xref: /linux/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #define BPF_NO_KFUNC_PROTOTYPES
5 #include "vmlinux.h"
6 
7 #include <bpf/bpf_helpers.h>
8 #include <bpf/bpf_endian.h>
9 #include <asm/errno.h>
10 
11 #include "bpf_compiler.h"
12 
/* TC verdicts returned by syncookie_tc(). */
#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

/* Nanoseconds per second; used by tcp_ns_to_ts(). */
#define NSEC_PER_SEC 1000000000L

/* MAC address length in bytes and the EtherTypes recognized by tcp_dissect(). */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* Access the TCP header word that carries doff and the flags (see
 * tcp_gen_synack()).
 */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Masks applied to the IPv4 frag_off field in tcp_lookup(). */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

/* Value of the IPv6 nexthdr field that tcp_dissect() accepts as TCP. */
#define NEXTHDR_TCP 6

/* TCP option kinds, matched against the option opcode while parsing and
 * emitted by tcp_mkoptions().
 */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* Expected lengths (in bytes) of the corresponding TCP options. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Timestamp cookie encoding used by tscookie_init(): the clock runs at
 * TCP_TS_HZ ticks per second and the low TSBITS bits of the timestamp are
 * replaced by the window scale (TS_OPT_WSCALE_MASK), SACK and ECN flags.
 */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
/* Upper clamp applied to a parsed window scale option. */
#define TCP_MAX_WSCALE 14U

/* Header sizes in bytes used for packet bounds checks and tail adjustment. */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used by values_get_tcpipopts() when the "values" map holds no
 * configuration, and the capacity of the "allowed_ports" map.
 */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8

/* Largest packet offset that next() will hand out. */
#define MAX_PACKET_OFF 0xffff
60 
/* Exchange the values of two lvalues of the same type. Each argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* Read a value of @type from @ptr through a packed struct, so that no
 * alignment of @ptr is assumed.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
	__pptr->x;								\
})

/* Unaligned load of *ptr; the type is taken from the pointer. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
70 
/* Two-slot array map shared with whoever populates it:
 *   key 0 - packed SYNACK parameters decoded by values_get_tcpipopts():
 *           bits 0-15 IPv4 MSS, bits 16-19 window scale, bits 24-31 TTL,
 *           bits 32-47 IPv6 MSS. A value of 0 selects the built-in defaults.
 *   key 1 - counter incremented by values_inc_synacks() for each generated
 *           SYNACK.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 2);
} values SEC(".maps");
77 
/* List of destination ports for which SYN cookies are issued, scanned by
 * check_port_allowed(). Entries are compared against the host-byte-order
 * destination port; an entry of 0 terminates the list.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u16);
	__uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
84 
85 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
86  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
87  */
88 
/* Local copy of the conntrack lookup options passed to
 * bpf_xdp_ct_lookup() / bpf_skb_ct_lookup() by tcp_lookup().
 */
struct bpf_ct_opts___local {
	s32 netns_id;	/* tcp_lookup() sets this to BPF_F_CURRENT_NETNS */
	s32 error;	/* inspected after a failed lookup (compared to -ENOENT) */
	u8 l4proto;	/* tcp_lookup() sets this to IPPROTO_TCP */
	u8 dir;		/* never set explicitly in this file; left zero */
	u8 reserved[2];	/* never set explicitly in this file; left zero */
} __attribute__((preserve_access_index));
96 
/* netns_id value used by tcp_lookup() for its conntrack lookups. */
#define BPF_F_CURRENT_NETNS (-1)

/* Conntrack lookup for the XDP program. tcp_lookup() treats a NULL return as
 * a failed lookup and then inspects opts->error; a non-NULL result is handed
 * back with bpf_ct_release().
 */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 __u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 __u32 len_opts) __ksym;

/* Same as bpf_xdp_ct_lookup(), but for the TC program's __sk_buff context. */
extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 len_tuple,
					 struct bpf_ct_opts___local *opts,
					 u32 len_opts) __ksym;

/* Releases a conntrack entry obtained from one of the lookups above. */
extern void bpf_ct_release(struct nf_conn *ct) __ksym;
112 
113 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
114 {
115 	__u8 tmp[ETH_ALEN];
116 
117 	__builtin_memcpy(tmp, a, ETH_ALEN);
118 	__builtin_memcpy(a, b, ETH_ALEN);
119 	__builtin_memcpy(b, tmp, ETH_ALEN);
120 }
121 
/* Fold a 32-bit ones'-complement accumulator down to 16 bits and return its
 * complement. Two folding rounds are needed because the first one can carry
 * into bit 16.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 folded;

	folded = (csum >> 16) + (csum & 0xffff);
	folded = (folded >> 16) + (folded & 0xffff);

	return (__u16)~folded;
}
128 
/* Finish a TCP/UDP checksum over IPv4: add the pseudo-header (source and
 * destination address, protocol and length) to the partial sum @csum, then
 * fold and complement the result.
 *
 * @saddr and @daddr are taken as stored in the packet; @len and @proto are
 * host-order values that are positioned according to the build's endianness.
 */
static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
					       __u32 len, __u8 proto,
					       __u32 csum)
{
	__u64 acc;

	/* Start from the protocol/length contribution of the pseudo-header. */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	acc = proto + len;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	acc = (proto + len) << 8;
#else
#error Unknown endian
#endif
	acc += csum;
	acc += (__u32)saddr;
	acc += (__u32)daddr;

	/* Collapse the 64-bit accumulator to 32 bits, carries included. */
	acc = (acc >> 32) + (acc & 0xffffffff);
	acc = (acc >> 32) + (acc & 0xffffffff);

	return csum_fold((__u32)acc);
}
149 
150 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
151 					     const struct in6_addr *daddr,
152 					     __u32 len, __u8 proto, __u32 csum)
153 {
154 	__u64 sum = csum;
155 	int i;
156 
157 	__pragma_loop_unroll
158 	for (i = 0; i < 4; i++)
159 		sum += (__u32)saddr->in6_u.u6_addr32[i];
160 
161 	__pragma_loop_unroll
162 	for (i = 0; i < 4; i++)
163 		sum += (__u32)daddr->in6_u.u6_addr32[i];
164 
165 	/* Don't combine additions to avoid 32-bit overflow. */
166 	sum += bpf_htonl(len);
167 	sum += bpf_htonl(proto);
168 
169 	sum = (sum & 0xffffffff) + (sum >> 32);
170 	sum = (sum & 0xffffffff) + (sum >> 32);
171 
172 	return csum_fold((__u32)sum);
173 }
174 
/* Clock source for timestamp cookies: returns bpf_ktime_get_ns() unchanged.
 * Callers treat the value as nanoseconds.
 */
static __always_inline __u64 tcp_clock_ns(void)
{
	return bpf_ktime_get_ns();
}
179 
180 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
181 {
182 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
183 }
184 
/* Current clock reading expressed in TCP timestamp ticks. */
static __always_inline __u32 tcp_clock_ms(void)
{
	__u64 now_ns = tcp_clock_ns();

	return tcp_ns_to_ts(now_ns);
}
189 
/* State shared between tscookie_init() and the TCP option parser callbacks. */
struct tcpopt_context {
	void *data;		/* start of packet data */
	void *data_end;		/* end of packet data, used for bounds checks */
	__be32 *tsecr;		/* where the client's tsval is stored when found */
	__u8 wscale;		/* window scale seen so far (clamped) */
	bool option_timestamp;	/* a well-formed timestamp option was seen */
	bool option_sack;	/* a well-formed SACK-permitted option was seen */
	__u32 off;		/* current parse offset, relative to data */
};
199 
/* Claim the next @sz bytes of the packet for the option parser.
 *
 * On success returns a pointer to the bytes at the current ctx->off and
 * advances ctx->off by @sz. Returns NULL, leaving ctx->off unchanged, when the
 * offset would exceed MAX_PACKET_OFF or when the claimed span does not end
 * strictly before ctx->data_end.
 */
static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
{
	__u64 off = ctx->off;
	__u8 *data;

	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
	if (off > MAX_PACKET_OFF - sz)
		return NULL;

	data = ctx->data + off;
	/* NOTE(review): presumably keeps the compiler from reshaping the
	 * pointer arithmetic around the bounds check below - confirm.
	 */
	barrier_var(data);
	/* The comparison is >=, so a span ending exactly at data_end is
	 * rejected as well.
	 */
	if (data + sz >= ctx->data_end)
		return NULL;

	ctx->off += sz;
	return data;
}
217 
/* Parse a single TCP option at ctx->off and record what is relevant for the
 * timestamp cookie (window scale, timestamp, SACK-permitted).
 *
 * Returns 0 to continue with the next option, or 1 to stop parsing: end of
 * option list, data exhausted, or an option length below 2.
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
	__u8 *opcode, *opsize, *wscale, *tsecr;
	/* Offset of this option's first byte; used to skip the whole option. */
	__u32 off = ctx->off;

	opcode = next(ctx, 1);
	if (!opcode)
		return 1;

	if (*opcode == TCPOPT_EOL)
		return 1;
	/* NOP is a single byte with no length field. */
	if (*opcode == TCPOPT_NOP)
		return 0;

	opsize = next(ctx, 1);
	if (!opsize || *opsize < 2)
		return 1;

	switch (*opcode) {
	case TCPOPT_WINDOW:
		wscale = next(ctx, 1);
		if (!wscale)
			return 1;
		/* Only honor the option if its length is the expected one. */
		if (*opsize == TCPOLEN_WINDOW)
			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
		break;
	case TCPOPT_TIMESTAMP:
		/* The four bytes following the length are the client's tsval. */
		tsecr = next(ctx, 4);
		if (!tsecr)
			return 1;
		if (*opsize == TCPOLEN_TIMESTAMP) {
			ctx->option_timestamp = true;
			/* Client's tsval becomes our tsecr. */
			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
		}
		break;
	case TCPOPT_SACK_PERM:
		if (*opsize == TCPOLEN_SACK_PERM)
			ctx->option_sack = true;
		break;
	}

	/* Jump to the byte after this option, regardless of how many bytes
	 * next() consumed above.
	 */
	ctx->off = off + *opsize;

	return 0;
}
264 
/* bpf_loop() callback: parse up to 7 TCP options per invocation.
 *
 * @index is unused. @context is the struct tcpopt_context passed to
 * bpf_loop() by tscookie_init(). Returns 1 as soon as
 * tscookie_tcpopt_parse() asks to stop, 0 otherwise.
 */
static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
{
	int i;

	for (i = 0; i < 7; i++)
		if (tscookie_tcpopt_parse(context))
			return 1;
	return 0;
}
274 
/* Build the timestamp cookie for a SYNACK from the options of the incoming SYN.
 *
 * Walks the TCP options that follow @tcp_header. If a well-formed timestamp
 * option is present, stores the client's tsval in *@tsecr, stores the cookie
 * in *@tsval and returns true; otherwise returns false and leaves *@tsval
 * untouched.
 *
 * Cookie layout (before conversion to network byte order): the millisecond
 * clock with its low TSBITS bits cleared, OR-ed with the window scale in the
 * low 4 bits (TS_OPT_WSCALE_MASK if no window scale option was seen),
 * TS_OPT_SACK and TS_OPT_ECN.
 *
 * @tcp_len is not used by this function.
 */
static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
					  __u16 tcp_len, __be32 *tsval,
					  __be32 *tsecr, void *data, void *data_end)
{
	struct tcpopt_context loop_ctx = {
		.data = data,
		.data_end = data_end,
		.tsecr = tsecr,
		.wscale = TS_OPT_WSCALE_MASK,
		.option_timestamp = false,
		.option_sack = false,
		/* Note: the verifier currently tracks .off as an unbounded
		 *       scalar. If it ever gets smarter and computes a bounded
		 *       value for this field, beware that this might hinder
		 *       bpf_loop() convergence validation.
		 */
		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
	};
	u32 cookie;

	/* 6 batches of up to 7 options each. */
	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);

	if (!loop_ctx.option_timestamp)
		return false;

	cookie = tcp_clock_ms() & ~TSMASK;
	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
	if (loop_ctx.option_sack)
		cookie |= TS_OPT_SACK;
	if (tcp_header->ece && tcp_header->cwr)
		cookie |= TS_OPT_ECN;
	*tsval = bpf_htonl(cookie);

	return true;
}
310 
311 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
312 						 __u8 *ttl, bool ipv6)
313 {
314 	__u32 key = 0;
315 	__u64 *value;
316 
317 	value = bpf_map_lookup_elem(&values, &key);
318 	if (value && *value != 0) {
319 		if (ipv6)
320 			*mss = (*value >> 32) & 0xffff;
321 		else
322 			*mss = *value & 0xffff;
323 		*wscale = (*value >> 16) & 0xf;
324 		*ttl = (*value >> 24) & 0xff;
325 		return;
326 	}
327 
328 	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
329 	*wscale = DEFAULT_WSCALE;
330 	*ttl = DEFAULT_TTL;
331 }
332 
333 static __always_inline void values_inc_synacks(void)
334 {
335 	__u32 key = 1;
336 	__u64 *value;
337 
338 	value = bpf_map_lookup_elem(&values, &key);
339 	if (value)
340 		__sync_fetch_and_add(value, 1);
341 }
342 
343 static __always_inline bool check_port_allowed(__u16 port)
344 {
345 	__u32 i;
346 
347 	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
348 		__u32 key = i;
349 		__u16 *value;
350 
351 		value = bpf_map_lookup_elem(&allowed_ports, &key);
352 
353 		if (!value)
354 			break;
355 		/* 0 is a terminator value. Check it first to avoid matching on
356 		 * a forbidden port == 0 and returning true.
357 		 */
358 		if (*value == 0)
359 			break;
360 
361 		if (*value == port)
362 			return true;
363 	}
364 
365 	return false;
366 }
367 
/* Pointers into the packet for the headers located by tcp_dissect().
 * Exactly one of ipv4/ipv6 is non-NULL once a TCP packet has been dissected.
 */
struct header_pointers {
	struct ethhdr *eth;
	struct iphdr *ipv4;	/* NULL for IPv6 packets */
	struct ipv6hdr *ipv6;	/* NULL for IPv4 packets */
	struct tcphdr *tcp;
	__u16 tcp_len;		/* TCP header length in bytes (doff * 4) */
};
375 
/* Locate the Ethernet, IPv4/IPv6 and TCP headers in [data, data_end) and
 * record them in @hdr.
 *
 * Returns:
 *   XDP_TX   - a TCP packet was found and @hdr is filled in; the caller
 *              continues processing (it is not a final verdict here).
 *   XDP_PASS - not IPv4/IPv6 or not TCP; not handled by this program.
 *   XDP_DROP - truncated or malformed headers.
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
				       struct header_pointers *hdr)
{
	hdr->eth = data;
	if (hdr->eth + 1 > data_end)
		return XDP_DROP;

	switch (bpf_ntohs(hdr->eth->h_proto)) {
	case ETH_P_IP:
		hdr->ipv6 = NULL;

		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv4 + 1 > data_end)
			return XDP_DROP;
		/* ihl counts 32-bit words; reject headers shorter than the
		 * fixed part.
		 */
		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
			return XDP_DROP;
		if (hdr->ipv4->version != 4)
			return XDP_DROP;

		if (hdr->ipv4->protocol != IPPROTO_TCP)
			return XDP_PASS;

		/* TCP follows the IPv4 header including its options. */
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
		break;
	case ETH_P_IPV6:
		hdr->ipv4 = NULL;

		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv6 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv6->version != 6)
			return XDP_DROP;

		/* XXX: Extension headers are not supported and could circumvent
		 * XDP SYN flood protection.
		 */
		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
		break;
	default:
		/* XXX: VLANs will circumvent XDP SYN flood protection. */
		return XDP_PASS;
	}

	if (hdr->tcp + 1 > data_end)
		return XDP_DROP;
	/* doff counts 32-bit words; it must cover at least the fixed header. */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_DROP;

	return XDP_TX;
}
430 
/* Look the packet's connection up in conntrack.
 *
 * @ctx is the program context (struct xdp_md when @xdp is true, otherwise
 * struct __sk_buff); @hdr must have been filled in by tcp_dissect().
 *
 * Returns:
 *   XDP_PASS    - the connection exists and is confirmed.
 *   XDP_TX      - no entry (-ENOENT) or an unconfirmed one; the caller
 *                 continues with SYN cookie handling.
 *   XDP_DROP    - IPv4 packet whose fragment bits are anything but DF alone.
 *   XDP_ABORTED - lookup failed with another error, or neither IP header
 *                 pointer is set.
 */
static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
{
	struct bpf_ct_opts___local ct_lookup_opts = {
		.netns_id = BPF_F_CURRENT_NETNS,
		.l4proto = IPPROTO_TCP,
	};
	struct bpf_sock_tuple tup = {};
	struct nf_conn *ct;
	__u32 tup_size;

	if (hdr->ipv4) {
		/* TCP doesn't normally use fragments, and XDP can't reassemble
		 * them.
		 */
		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
			return XDP_DROP;

		tup.ipv4.saddr = hdr->ipv4->saddr;
		tup.ipv4.daddr = hdr->ipv4->daddr;
		tup.ipv4.sport = hdr->tcp->source;
		tup.ipv4.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv4);
	} else if (hdr->ipv6) {
		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
		tup.ipv6.sport = hdr->tcp->source;
		tup.ipv6.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv6);
	} else {
		/* The verifier can't track that either ipv4 or ipv6 is not
		 * NULL.
		 */
		return XDP_ABORTED;
	}
	if (xdp)
		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
	else
		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
	if (ct) {
		/* Copy the status out before dropping the reference. */
		unsigned long status = ct->status;

		bpf_ct_release(ct);
		if (status & IPS_CONFIRMED)
			return XDP_PASS;
	} else if (ct_lookup_opts.error != -ENOENT) {
		return XDP_ABORTED;
	}

	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
	return XDP_TX;
}
482 
483 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
484 					  __u8 wscale)
485 {
486 	__be32 *start = buf;
487 
488 	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
489 
490 	if (!tsopt)
491 		return buf - start;
492 
493 	if (tsopt[0] & bpf_htonl(1 << 4))
494 		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
495 				   (TCPOLEN_SACK_PERM << 16) |
496 				   (TCPOPT_TIMESTAMP << 8) |
497 				   TCPOLEN_TIMESTAMP);
498 	else
499 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
500 				   (TCPOPT_NOP << 16) |
501 				   (TCPOPT_TIMESTAMP << 8) |
502 				   TCPOLEN_TIMESTAMP);
503 	*buf++ = tsopt[0];
504 	*buf++ = tsopt[1];
505 
506 	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
507 		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
508 				   (TCPOPT_WINDOW << 16) |
509 				   (TCPOLEN_WINDOW << 8) |
510 				   wscale);
511 
512 	return buf - start;
513 }
514 
515 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
516 					   __u32 cookie, __be32 *tsopt,
517 					   __u16 mss, __u8 wscale)
518 {
519 	void *tcp_options;
520 
521 	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
522 	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
523 		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
524 	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
525 	swap(tcp_header->source, tcp_header->dest);
526 	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
527 	tcp_header->seq = bpf_htonl(cookie);
528 	tcp_header->window = 0;
529 	tcp_header->urg_ptr = 0;
530 	tcp_header->check = 0; /* Calculate checksum later. */
531 
532 	tcp_options = (void *)(tcp_header + 1);
533 	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
534 }
535 
/* Rewrite an IPv4 SYN packet into a SYNACK, in place.
 *
 * Swaps MAC and IP addresses, resets the mutable IPv4 fields, rewrites the TCP
 * header via tcp_gen_synack() and updates hdr->tcp_len and the IPv4 total
 * length. Both checksums are left for the caller to compute.
 */
static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, false);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
	hdr->ipv4->check = 0; /* Calculate checksum later. */
	hdr->ipv4->tos = 0;
	hdr->ipv4->id = 0;
	hdr->ipv4->ttl = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	/* The new TCP header length depends on the options just written. */
	hdr->tcp_len = hdr->tcp->doff * 4;
	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
}
558 
/* Rewrite an IPv6 SYN packet into a SYNACK, in place.
 *
 * Swaps MAC and IP addresses, resets the first 32-bit word of the IPv6 header
 * to version 6 with all remaining bits of that word cleared, sets the hop
 * limit, rewrites the TCP header via tcp_gen_synack() and updates
 * hdr->tcp_len and the IPv6 payload length. The TCP checksum is left for the
 * caller to compute.
 */
static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, true);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
	hdr->ipv6->hop_limit = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	/* The new TCP header length depends on the options just written. */
	hdr->tcp_len = hdr->tcp->doff * 4;
	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
}
579 
/* Answer a SYN with a SYN-cookie SYNACK built in place.
 *
 * @hdr:  header pointers re-derived by syncookie_part2().
 * @ctx:  program context (struct xdp_md if @xdp, else struct __sk_buff).
 * @data, @data_end: current packet bounds.
 *
 * Steps: sanity-check the flags and destination port, verify the incoming
 * checksums, obtain the cookie from the kernel helper, build the optional
 * timestamp cookie, rewrite the headers, recompute checksums and trim the
 * packet to the new size.
 *
 * Returns XDP_TX when the SYNACK is ready to be sent, XDP_DROP for packets
 * that are rejected (bad flags, disallowed port, bad checksum) and
 * XDP_ABORTED when a helper fails or the packet is too short.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
						void *ctx,
						void *data, void *data_end,
						bool xdp)
{
	__u32 old_pkt_size, new_pkt_size;
	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
	 * the pointer value and use it directly, otherwise tcp_mkoptions is
	 * (mis)compiled like this:
	 *   if (!tsopt)
	 *       return buf - start;
	 *   reg = stored_return_value_of_tscookie_init;
	 *   if (reg)
	 *       tsopt = tsopt_buf;
	 *   else
	 *       tsopt = NULL;
	 *   ...
	 *   *buf++ = tsopt[1];
	 * It creates a dead branch where tsopt is assigned NULL, but the
	 * verifier can't prove it's dead and blocks the program.
	 */
	__be32 * volatile tsopt = NULL;
	/* [0] receives our tsval (the cookie), [1] the tsecr to echo. */
	__be32 tsopt_buf[2] = {};
	__u16 ip_len;
	__u32 cookie;
	__s64 value;

	/* Checksum is not yet verified, but both checksum failure and TCP
	 * header checks return XDP_DROP, so the order doesn't matter.
	 */
	if (hdr->tcp->fin || hdr->tcp->rst)
		return XDP_DROP;

	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
	 * ports.
	 */
	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
		return XDP_DROP;

	if (hdr->ipv4) {
		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_fold(value) != 0)
			return XDP_DROP; /* Bad IPv4 checksum. */

		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		/* The reply carries no IPv4 options. */
		ip_len = sizeof(*hdr->ipv4);

		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
						       hdr->tcp_len);
	} else if (hdr->ipv6) {
		/* Check the TCP checksum before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		ip_len = sizeof(*hdr->ipv6);

		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
						       hdr->tcp_len);
	} else {
		return XDP_ABORTED;
	}

	/* A negative return from the syncookie helper is an error. */
	if (value < 0)
		return XDP_ABORTED;
	cookie = (__u32)value;

	/* Only send a timestamp option if the client sent one. */
	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
		tsopt = tsopt_buf;

	/* Check that there is enough space for a SYNACK. It also covers
	 * the check that the destination of the __builtin_memmove below
	 * doesn't overflow.
	 */
	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	if (hdr->ipv4) {
		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
			struct tcphdr *new_tcp_header;

			/* Drop the IPv4 options: move the fixed TCP header
			 * directly behind the fixed IPv4 header.
			 */
			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
			hdr->tcp = new_tcp_header;

			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
		}

		tcpv4_gen_synack(hdr, cookie, tsopt);
	} else if (hdr->ipv6) {
		tcpv6_gen_synack(hdr, cookie, tsopt);
	} else {
		return XDP_ABORTED;
	}

	/* Recalculate checksums. */
	hdr->tcp->check = 0;
	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
	if (value < 0)
		return XDP_ABORTED;
	if (hdr->ipv4) {
		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
						    hdr->ipv4->daddr,
						    hdr->tcp_len,
						    IPPROTO_TCP,
						    value);

		hdr->ipv4->check = 0;
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
		if (value < 0)
			return XDP_ABORTED;
		hdr->ipv4->check = csum_fold(value);
	} else if (hdr->ipv6) {
		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
						  &hdr->ipv6->daddr,
						  hdr->tcp_len,
						  IPPROTO_TCP,
						  value);
	} else {
		return XDP_ABORTED;
	}

	/* Set the new packet size. */
	old_pkt_size = data_end - data;
	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	if (xdp) {
		/* bpf_xdp_adjust_tail() takes a delta ... */
		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
			return XDP_ABORTED;
	} else {
		/* ... while bpf_skb_change_tail() takes the new length. */
		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
			return XDP_ABORTED;
	}

	values_inc_synacks();

	return XDP_TX;
}
731 
732 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
733 {
734 	int err;
735 
736 	if (hdr->tcp->rst)
737 		return XDP_DROP;
738 
739 	if (hdr->ipv4)
740 		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
741 	else if (hdr->ipv6)
742 		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
743 	else
744 		return XDP_ABORTED;
745 	if (err)
746 		return XDP_DROP;
747 
748 	return XDP_PASS;
749 }
750 
/* First half of the SYN proxy: classify the packet and make room for a
 * full-size TCP header.
 *
 * Returns XDP_TX when the caller must go on with syncookie_part2() (after
 * re-reading the packet pointers, since the tail was adjusted); any other
 * value is the final verdict.
 */
static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	int ret;

	ret = tcp_dissect(data, data_end, hdr);
	if (ret != XDP_TX)
		return ret;

	ret = tcp_lookup(ctx, hdr, xdp);
	if (ret != XDP_TX)
		return ret;

	/* Packet is TCP and doesn't belong to an established connection. */

	/* Exactly one of SYN and ACK must be set. */
	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
		return XDP_DROP;

	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
	 */
	if (xdp) {
		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
			return XDP_ABORTED;
	} else {
		/* Without volatile the verifier throws this error:
		 * R9 32-bit pointer arithmetic prohibited
		 */
		volatile u64 old_len = data_end - data;

		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
			return XDP_ABORTED;
	}

	return XDP_TX;
}
787 
/* Second half of the SYN proxy, run after the packet tail was adjusted.
 *
 * Re-derives all header pointers from the fresh @data/@data_end (only the
 * NULL-ness of hdr->ipv4/hdr->ipv6 is carried over from part 1), re-checks
 * the bounds with the maximum header sizes and dispatches to the SYN or ACK
 * handler. Returns that handler's verdict, or XDP_ABORTED if a bounds check
 * fails.
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	if (hdr->ipv4) {
		hdr->eth = data;
		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		/* IPV4_MAXLEN is needed when calculating checksum.
		 * At least sizeof(struct iphdr) is needed here to access ihl.
		 */
		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
	} else if (hdr->ipv6) {
		hdr->eth = data;
		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
	} else {
		return XDP_ABORTED;
	}

	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	/* We run out of registers, tcp_len gets spilled to the stack, and the
	 * verifier forgets its min and max values checked above in tcp_dissect.
	 */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_ABORTED;

	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
			       syncookie_handle_ack(hdr);
}
821 
/* XDP entry point of the SYN proxy. Returns an XDP_* verdict. */
SEC("xdp")
int syncookie_xdp(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct header_pointers hdr;
	int ret;

	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
	if (ret != XDP_TX)
		return ret;

	/* Part 1 adjusted the packet tail, so the packet bounds must be read
	 * again from the context.
	 */
	data_end = (void *)(long)ctx->data_end;
	data = (void *)(long)ctx->data;

	return syncookie_part2(ctx, data, data_end, &hdr, true);
}
839 
/* TC entry point of the SYN proxy.
 *
 * Runs the same two-part logic as the XDP program and translates the XDP_*
 * result into a TC verdict: XDP_PASS becomes TC_ACT_OK, XDP_TX redirects the
 * packet to the interface given by skb->ifindex, everything else becomes
 * TC_ACT_SHOT.
 */
SEC("tc")
int syncookie_tc(struct __sk_buff *skb)
{
	void *data_end = (void *)(long)skb->data_end;
	void *data = (void *)(long)skb->data;
	struct header_pointers hdr;
	int ret;

	ret = syncookie_part1(skb, data, data_end, &hdr, false);
	if (ret != XDP_TX)
		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;

	/* Part 1 changed the packet tail, so the packet bounds must be read
	 * again from the context.
	 */
	data_end = (void *)(long)skb->data_end;
	data = (void *)(long)skb->data;

	ret = syncookie_part2(skb, data, data_end, &hdr, false);
	switch (ret) {
	case XDP_PASS:
		return TC_ACT_OK;
	case XDP_TX:
		return bpf_redirect(skb->ifindex, 0);
	default:
		return TC_ACT_SHOT;
	}
}
865 
/* License string emitted into the object's "license" section. */
char _license[] SEC("license") = "GPL";
867